Diffstat (limited to 'src/video_core')
50 files changed, 852 insertions, 279 deletions
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index 06fd40851..1f656ffa8 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -55,6 +55,19 @@ constexpr u32 NUM_STORAGE_BUFFERS = 16; constexpr u32 NUM_TEXTURE_BUFFERS = 16; constexpr u32 NUM_STAGES = 5; +enum class ObtainBufferSynchronize : u32 { + NoSynchronize = 0, + FullSynchronize = 1, + SynchronizeNoDirty = 2, +}; + +enum class ObtainBufferOperation : u32 { + DoNothing = 0, + MarkAsWritten = 1, + DiscardWrite = 2, + MarkQuery = 3, +}; + using UniformBufferSizes = std::array<std::array<u32, NUM_GRAPHICS_UNIFORM_BUFFERS>, NUM_STAGES>; using ComputeUniformBufferSizes = std::array<u32, NUM_COMPUTE_UNIFORM_BUFFERS>; @@ -191,6 +204,10 @@ public: bool DMAClear(GPUVAddr src_address, u64 amount, u32 value); + [[nodiscard]] std::pair<Buffer*, u32> ObtainBuffer(GPUVAddr gpu_addr, u32 size, + ObtainBufferSynchronize sync_info, + ObtainBufferOperation post_op); + /// Return true when a CPU region is modified from the GPU [[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size); @@ -366,7 +383,8 @@ private: void NotifyBufferDeletion(); - [[nodiscard]] Binding StorageBufferBinding(GPUVAddr ssbo_addr, bool is_written = false) const; + [[nodiscard]] Binding StorageBufferBinding(GPUVAddr ssbo_addr, u32 cbuf_index, + bool is_written = false) const; [[nodiscard]] TextureBufferBinding GetTextureBufferBinding(GPUVAddr gpu_addr, u32 size, PixelFormat format); @@ -642,6 +660,42 @@ bool BufferCache<P>::DMAClear(GPUVAddr dst_address, u64 amount, u32 value) { } template <class P> +std::pair<typename P::Buffer*, u32> BufferCache<P>::ObtainBuffer(GPUVAddr gpu_addr, u32 size, + ObtainBufferSynchronize sync_info, + ObtainBufferOperation post_op) { + const std::optional<VAddr> cpu_addr = gpu_memory->GpuToCpuAddress(gpu_addr); + if (!cpu_addr) { + return {&slot_buffers[NULL_BUFFER_ID], 0}; + } + const BufferId buffer_id = FindBuffer(*cpu_addr, size); + Buffer& buffer = slot_buffers[buffer_id]; + + // synchronize op + switch (sync_info) { + case ObtainBufferSynchronize::FullSynchronize: + SynchronizeBuffer(buffer, *cpu_addr, size); + break; + default: + break; + } + + switch (post_op) { + case ObtainBufferOperation::MarkAsWritten: + MarkWrittenBuffer(buffer_id, *cpu_addr, size); + break; + case ObtainBufferOperation::DiscardWrite: { + IntervalType interval{*cpu_addr, size}; + ClearDownload(interval); + break; + } + default: + break; + } + + return {&buffer, buffer.Offset(*cpu_addr)}; +} + +template <class P> void BufferCache<P>::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size) { const std::optional<VAddr> cpu_addr = gpu_memory->GpuToCpuAddress(gpu_addr); @@ -749,7 +803,7 @@ void BufferCache<P>::BindGraphicsStorageBuffer(size_t stage, size_t ssbo_index, const auto& cbufs = maxwell3d->state.shader_stages[stage]; const GPUVAddr ssbo_addr = cbufs.const_buffers[cbuf_index].address + cbuf_offset; - storage_buffers[stage][ssbo_index] = StorageBufferBinding(ssbo_addr, is_written); + storage_buffers[stage][ssbo_index] = StorageBufferBinding(ssbo_addr, cbuf_index, is_written); } template <class P> @@ -789,7 +843,7 @@ void BufferCache<P>::BindComputeStorageBuffer(size_t ssbo_index, u32 cbuf_index, const auto& cbufs = launch_desc.const_buffer_config; const GPUVAddr ssbo_addr = cbufs[cbuf_index].Address() + cbuf_offset; - compute_storage_buffers[ssbo_index] = StorageBufferBinding(ssbo_addr, is_written); + 
compute_storage_buffers[ssbo_index] = StorageBufferBinding(ssbo_addr, cbuf_index, is_written); } template <class P> @@ -1935,11 +1989,26 @@ void BufferCache<P>::NotifyBufferDeletion() { template <class P> typename BufferCache<P>::Binding BufferCache<P>::StorageBufferBinding(GPUVAddr ssbo_addr, + u32 cbuf_index, bool is_written) const { const GPUVAddr gpu_addr = gpu_memory->Read<u64>(ssbo_addr); - const u32 size = gpu_memory->Read<u32>(ssbo_addr + 8); + const auto size = [&]() { + const bool is_nvn_cbuf = cbuf_index == 0; + // The NVN driver buffer (index 0) is known to pack the SSBO address followed by its size. + if (is_nvn_cbuf) { + return gpu_memory->Read<u32>(ssbo_addr + 8); + } + // Other titles (notably Doom Eternal) may use STG/LDG on buffer addresses in custom defined + // cbufs, which do not store the sizes adjacent to the addresses, so use the fully + // mapped buffer size for now. + const u32 memory_layout_size = static_cast<u32>(gpu_memory->GetMemoryLayoutSize(gpu_addr)); + LOG_INFO(HW_GPU, "Binding storage buffer for cbuf index {}, MemoryLayoutSize 0x{:X}", + cbuf_index, memory_layout_size); + return memory_layout_size; + }(); const std::optional<VAddr> cpu_addr = gpu_memory->GpuToCpuAddress(gpu_addr); if (!cpu_addr || size == 0) { + LOG_WARNING(HW_GPU, "Failed to find storage buffer for cbuf index {}", cbuf_index); return NULL_BINDING; } const VAddr cpu_end = Common::AlignUp(*cpu_addr + size, Core::Memory::YUZU_PAGESIZE); diff --git a/src/video_core/control/channel_state_cache.h b/src/video_core/control/channel_state_cache.h index cdaf4f8d5..46bc9e322 100644 --- a/src/video_core/control/channel_state_cache.h +++ b/src/video_core/control/channel_state_cache.h @@ -44,7 +44,7 @@ public: template <class P> class ChannelSetupCaches { public: - /// Operations for seting the channel of execution. + /// Operations for setting the channel of execution. virtual ~ChannelSetupCaches(); /// Create channel state. 
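For context on the ObtainBuffer API introduced above, here is a minimal caller sketch. The helper name and the is_download flag are hypothetical; the cache type and enums are the ones added in this hunk. It mirrors how the AccelerateDMA paths later in this diff pick their synchronize/post-op pair: a full synchronize so the host copy is current before use, and MarkAsWritten only when the GPU result must later be written back.

template <class P>
std::pair<typename P::Buffer*, u32> ObtainForDma(VideoCommon::BufferCache<P>& buffer_cache,
                                                 GPUVAddr gpu_addr, u32 size, bool is_download) {
    // Bring the host-visible copy of the range up to date before touching it.
    constexpr auto sync = VideoCommon::ObtainBufferSynchronize::FullSynchronize;
    // A download writes into the buffer, so mark the range as GPU-modified;
    // an upload only reads from it, so no post-op is needed.
    const auto post_op = is_download ? VideoCommon::ObtainBufferOperation::MarkAsWritten
                                     : VideoCommon::ObtainBufferOperation::DoNothing;
    // The call sites in this diff hold buffer_cache.mutex around this call.
    return buffer_cache.ObtainBuffer(gpu_addr, size, sync, post_op);
}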
diff --git a/src/video_core/engines/draw_manager.cpp b/src/video_core/engines/draw_manager.cpp index 1d22d25f1..0e94c521a 100644 --- a/src/video_core/engines/draw_manager.cpp +++ b/src/video_core/engines/draw_manager.cpp @@ -164,6 +164,7 @@ void DrawManager::DrawEnd(u32 instance_count, bool force_draw) { draw_state.index_buffer.count = static_cast<u32>(draw_state.inline_index_draw_indexes.size() / 4); draw_state.index_buffer.format = Maxwell3D::Regs::IndexFormat::UnsignedInt; + maxwell3d->dirty.flags[VideoCommon::Dirty::IndexBuffer] = true; ProcessDraw(true, instance_count); draw_state.inline_index_draw_indexes.clear(); break; diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp index 7762c7d96..e68850dc5 100644 --- a/src/video_core/engines/maxwell_dma.cpp +++ b/src/video_core/engines/maxwell_dma.cpp @@ -14,7 +14,13 @@ #include "video_core/textures/decoders.h" MICROPROFILE_DECLARE(GPU_DMAEngine); +MICROPROFILE_DECLARE(GPU_DMAEngineBL); +MICROPROFILE_DECLARE(GPU_DMAEngineLB); +MICROPROFILE_DECLARE(GPU_DMAEngineBB); MICROPROFILE_DEFINE(GPU_DMAEngine, "GPU", "DMA Engine", MP_RGB(224, 224, 128)); +MICROPROFILE_DEFINE(GPU_DMAEngineBL, "GPU", "DMA Engine Block - Linear", MP_RGB(224, 224, 128)); +MICROPROFILE_DEFINE(GPU_DMAEngineLB, "GPU", "DMA Engine Linear - Block", MP_RGB(224, 224, 128)); +MICROPROFILE_DEFINE(GPU_DMAEngineBB, "GPU", "DMA Engine Block - Block", MP_RGB(224, 224, 128)); namespace Tegra::Engines { @@ -72,6 +78,7 @@ void MaxwellDMA::Launch() { memory_manager.FlushCaching(); if (!is_src_pitch && !is_dst_pitch) { // If both the source and the destination are in block layout, assert. + MICROPROFILE_SCOPE(GPU_DMAEngineBB); CopyBlockLinearToBlockLinear(); ReleaseSemaphore(); return; @@ -87,8 +94,10 @@ void MaxwellDMA::Launch() { } } else { if (!is_src_pitch && is_dst_pitch) { + MICROPROFILE_SCOPE(GPU_DMAEngineBL); CopyBlockLinearToPitch(); } else { + MICROPROFILE_SCOPE(GPU_DMAEngineLB); CopyPitchToBlockLinear(); } } @@ -153,21 +162,35 @@ void MaxwellDMA::Launch() { } void MaxwellDMA::CopyBlockLinearToPitch() { - UNIMPLEMENTED_IF(regs.src_params.block_size.width != 0); - UNIMPLEMENTED_IF(regs.src_params.layer != 0); - - const bool is_remapping = regs.launch_dma.remap_enable != 0; - - // Optimized path for micro copies. - const size_t dst_size = static_cast<size_t>(regs.pitch_out) * regs.line_count; - if (!is_remapping && dst_size < GOB_SIZE && regs.pitch_out <= GOB_SIZE_X && - regs.src_params.height > GOB_SIZE_Y) { - FastCopyBlockLinearToPitch(); + UNIMPLEMENTED_IF(regs.launch_dma.remap_enable != 0); + + u32 bytes_per_pixel = 1; + DMA::ImageOperand src_operand; + src_operand.bytes_per_pixel = bytes_per_pixel; + src_operand.params = regs.src_params; + src_operand.address = regs.offset_in; + + DMA::BufferOperand dst_operand; + dst_operand.pitch = regs.pitch_out; + dst_operand.width = regs.line_length_in; + dst_operand.height = regs.line_count; + dst_operand.address = regs.offset_out; + DMA::ImageCopy copy_info{}; + copy_info.length_x = regs.line_length_in; + copy_info.length_y = regs.line_count; + auto& accelerate = rasterizer->AccessAccelerateDMA(); + if (accelerate.ImageToBuffer(copy_info, src_operand, dst_operand)) { return; } + UNIMPLEMENTED_IF(regs.src_params.block_size.width != 0); + UNIMPLEMENTED_IF(regs.src_params.block_size.depth != 0); + UNIMPLEMENTED_IF(regs.src_params.block_size.depth == 0 && regs.src_params.depth != 1); + // Deswizzle the input and copy it over. 
- const Parameters& src_params = regs.src_params; + const DMA::Parameters& src_params = regs.src_params; + + const bool is_remapping = regs.launch_dma.remap_enable != 0; const u32 num_remap_components = regs.remap_const.num_dst_components_minus_one + 1; const u32 remap_components_size = regs.remap_const.component_size_minus_one + 1; @@ -187,7 +210,7 @@ void MaxwellDMA::CopyBlockLinearToPitch() { x_offset >>= bpp_shift; } - const u32 bytes_per_pixel = base_bpp << bpp_shift; + bytes_per_pixel = base_bpp << bpp_shift; const u32 height = src_params.height; const u32 depth = src_params.depth; const u32 block_height = src_params.block_size.height; @@ -195,11 +218,12 @@ void MaxwellDMA::CopyBlockLinearToPitch() { const size_t src_size = CalculateSize(true, bytes_per_pixel, width, height, depth, block_height, block_depth); + const size_t dst_size = static_cast<size_t>(regs.pitch_out) * regs.line_count; read_buffer.resize_destructive(src_size); write_buffer.resize_destructive(dst_size); - memory_manager.ReadBlock(regs.offset_in, read_buffer.data(), src_size); - memory_manager.ReadBlock(regs.offset_out, write_buffer.data(), dst_size); + memory_manager.ReadBlock(src_operand.address, read_buffer.data(), src_size); + memory_manager.ReadBlockUnsafe(dst_operand.address, write_buffer.data(), dst_size); UnswizzleSubrect(write_buffer, read_buffer, bytes_per_pixel, width, height, depth, x_offset, src_params.origin.y, x_elements, regs.line_count, block_height, block_depth, @@ -216,6 +240,24 @@ void MaxwellDMA::CopyPitchToBlockLinear() { const u32 num_remap_components = regs.remap_const.num_dst_components_minus_one + 1; const u32 remap_components_size = regs.remap_const.component_size_minus_one + 1; + u32 bytes_per_pixel = 1; + DMA::ImageOperand dst_operand; + dst_operand.bytes_per_pixel = bytes_per_pixel; + dst_operand.params = regs.dst_params; + dst_operand.address = regs.offset_out; + DMA::BufferOperand src_operand; + src_operand.pitch = regs.pitch_in; + src_operand.width = regs.line_length_in; + src_operand.height = regs.line_count; + src_operand.address = regs.offset_in; + DMA::ImageCopy copy_info{}; + copy_info.length_x = regs.line_length_in; + copy_info.length_y = regs.line_count; + auto& accelerate = rasterizer->AccessAccelerateDMA(); + if (accelerate.BufferToImage(copy_info, src_operand, dst_operand)) { + return; + } + const auto& dst_params = regs.dst_params; const u32 base_bpp = !is_remapping ? 
1U : num_remap_components * remap_components_size; @@ -233,7 +275,7 @@ void MaxwellDMA::CopyPitchToBlockLinear() { x_offset >>= bpp_shift; } - const u32 bytes_per_pixel = base_bpp << bpp_shift; + bytes_per_pixel = base_bpp << bpp_shift; const u32 height = dst_params.height; const u32 depth = dst_params.depth; const u32 block_height = dst_params.block_size.height; @@ -260,45 +302,14 @@ void MaxwellDMA::CopyPitchToBlockLinear() { memory_manager.WriteBlockCached(regs.offset_out, write_buffer.data(), dst_size); } -void MaxwellDMA::FastCopyBlockLinearToPitch() { - const u32 bytes_per_pixel = 1U; - const size_t src_size = GOB_SIZE; - const size_t dst_size = static_cast<size_t>(regs.pitch_out) * regs.line_count; - u32 pos_x = regs.src_params.origin.x; - u32 pos_y = regs.src_params.origin.y; - const u64 offset = GetGOBOffset(regs.src_params.width, regs.src_params.height, pos_x, pos_y, - regs.src_params.block_size.height, bytes_per_pixel); - const u32 x_in_gob = 64 / bytes_per_pixel; - pos_x = pos_x % x_in_gob; - pos_y = pos_y % 8; - - read_buffer.resize_destructive(src_size); - write_buffer.resize_destructive(dst_size); - - if (Settings::IsGPULevelExtreme()) { - memory_manager.ReadBlock(regs.offset_in + offset, read_buffer.data(), src_size); - memory_manager.ReadBlock(regs.offset_out, write_buffer.data(), dst_size); - } else { - memory_manager.ReadBlockUnsafe(regs.offset_in + offset, read_buffer.data(), src_size); - memory_manager.ReadBlockUnsafe(regs.offset_out, write_buffer.data(), dst_size); - } - - UnswizzleSubrect(write_buffer, read_buffer, bytes_per_pixel, regs.src_params.width, - regs.src_params.height, 1, pos_x, pos_y, regs.line_length_in, regs.line_count, - regs.src_params.block_size.height, regs.src_params.block_size.depth, - regs.pitch_out); - - memory_manager.WriteBlockCached(regs.offset_out, write_buffer.data(), dst_size); -} - void MaxwellDMA::CopyBlockLinearToBlockLinear() { UNIMPLEMENTED_IF(regs.src_params.block_size.width != 0); const bool is_remapping = regs.launch_dma.remap_enable != 0; // Deswizzle the input and copy it over. 
- const Parameters& src = regs.src_params; - const Parameters& dst = regs.dst_params; + const DMA::Parameters& src = regs.src_params; + const DMA::Parameters& dst = regs.dst_params; const u32 num_remap_components = regs.remap_const.num_dst_components_minus_one + 1; const u32 remap_components_size = regs.remap_const.component_size_minus_one + 1; diff --git a/src/video_core/engines/maxwell_dma.h b/src/video_core/engines/maxwell_dma.h index 0e594fa74..69e26cb32 100644 --- a/src/video_core/engines/maxwell_dma.h +++ b/src/video_core/engines/maxwell_dma.h @@ -24,6 +24,54 @@ namespace VideoCore { class RasterizerInterface; } +namespace Tegra { +namespace DMA { + +union Origin { + BitField<0, 16, u32> x; + BitField<16, 16, u32> y; +}; +static_assert(sizeof(Origin) == 4); + +struct ImageCopy { + u32 length_x{}; + u32 length_y{}; +}; + +union BlockSize { + BitField<0, 4, u32> width; + BitField<4, 4, u32> height; + BitField<8, 4, u32> depth; + BitField<12, 4, u32> gob_height; +}; +static_assert(sizeof(BlockSize) == 4); + +struct Parameters { + BlockSize block_size; + u32 width; + u32 height; + u32 depth; + u32 layer; + Origin origin; +}; +static_assert(sizeof(Parameters) == 24); + +struct ImageOperand { + u32 bytes_per_pixel; + Parameters params; + GPUVAddr address; +}; + +struct BufferOperand { + u32 pitch; + u32 width; + u32 height; + GPUVAddr address; +}; + +} // namespace DMA +} // namespace Tegra + namespace Tegra::Engines { class AccelerateDMAInterface { @@ -32,6 +80,12 @@ public: virtual bool BufferCopy(GPUVAddr src_address, GPUVAddr dest_address, u64 amount) = 0; virtual bool BufferClear(GPUVAddr src_address, u64 amount, u32 value) = 0; + + virtual bool ImageToBuffer(const DMA::ImageCopy& copy_info, const DMA::ImageOperand& src, + const DMA::BufferOperand& dst) = 0; + + virtual bool BufferToImage(const DMA::ImageCopy& copy_info, const DMA::BufferOperand& src, + const DMA::ImageOperand& dst) = 0; }; /** @@ -51,30 +105,6 @@ public: } }; - union BlockSize { - BitField<0, 4, u32> width; - BitField<4, 4, u32> height; - BitField<8, 4, u32> depth; - BitField<12, 4, u32> gob_height; - }; - static_assert(sizeof(BlockSize) == 4); - - union Origin { - BitField<0, 16, u32> x; - BitField<16, 16, u32> y; - }; - static_assert(sizeof(Origin) == 4); - - struct Parameters { - BlockSize block_size; - u32 width; - u32 height; - u32 depth; - u32 layer; - Origin origin; - }; - static_assert(sizeof(Parameters) == 24); - struct Semaphore { PackedGPUVAddr address; u32 payload; @@ -227,8 +257,6 @@ private: void CopyBlockLinearToBlockLinear(); - void FastCopyBlockLinearToPitch(); - void ReleaseSemaphore(); void ConsumeSinkImpl() override; @@ -261,17 +289,17 @@ private: u32 reserved05[0x3f]; PackedGPUVAddr offset_in; PackedGPUVAddr offset_out; - u32 pitch_in; - u32 pitch_out; + s32 pitch_in; + s32 pitch_out; u32 line_length_in; u32 line_count; u32 reserved06[0xb6]; u32 remap_consta_value; u32 remap_constb_value; RemapConst remap_const; - Parameters dst_params; + DMA::Parameters dst_params; u32 reserved07[0x1]; - Parameters src_params; + DMA::Parameters src_params; u32 reserved08[0x275]; u32 pm_trigger_end; u32 reserved09[0x3ba]; diff --git a/src/video_core/engines/sw_blitter/blitter.cpp b/src/video_core/engines/sw_blitter/blitter.cpp index 2f1ea4626..3c9f38559 100644 --- a/src/video_core/engines/sw_blitter/blitter.cpp +++ b/src/video_core/engines/sw_blitter/blitter.cpp @@ -193,7 +193,7 @@ bool SoftwareBlitEngine::Blit(Fermi2D::Surface& src, Fermi2D::Surface& dst, output_converter->ConvertFrom(impl->intermediate_dst, 
impl->dst_buffer); }; - // Do actuall Blit + // Do actual Blit impl->dst_buffer.resize(dst_copy_size); if (src.linear == Fermi2D::MemoryLayout::BlockLinear) { diff --git a/src/video_core/framebuffer_config.h b/src/video_core/framebuffer_config.h index d93f5a37f..5f3bffcab 100644 --- a/src/video_core/framebuffer_config.h +++ b/src/video_core/framebuffer_config.h @@ -5,8 +5,8 @@ #include "common/common_types.h" #include "common/math_util.h" -#include "core/hle/service/nvflinger/buffer_transform_flags.h" -#include "core/hle/service/nvflinger/pixel_format.h" +#include "core/hle/service/nvnflinger/buffer_transform_flags.h" +#include "core/hle/service/nvnflinger/pixel_format.h" namespace Tegra { diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp index 7024a19cf..2e7f9c5ed 100644 --- a/src/video_core/gpu.cpp +++ b/src/video_core/gpu.cpp @@ -197,7 +197,7 @@ struct GPU::Impl { constexpr u64 gpu_ticks_num = 384; constexpr u64 gpu_ticks_den = 625; - u64 nanoseconds = system.CoreTiming().GetGlobalTimeNs().count(); + u64 nanoseconds = system.CoreTiming().GetCPUTimeNs().count(); if (Settings::values.use_fast_gpu_time.GetValue()) { nanoseconds /= 256; } diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp index 7cc5647e9..f52f9e28f 100644 --- a/src/video_core/gpu_thread.cpp +++ b/src/video_core/gpu_thread.cpp @@ -25,7 +25,7 @@ static void RunThread(std::stop_token stop_token, Core::System& system, SCOPE_EXIT({ MicroProfileOnThreadExit(); }); Common::SetCurrentThreadName(name.c_str()); - Common::SetCurrentThreadPriority(Common::ThreadPriority::High); + Common::SetCurrentThreadPriority(Common::ThreadPriority::Critical); system.RegisterHostThread(); auto current_context = context.Acquire(); diff --git a/src/video_core/host_shaders/astc_decoder.comp b/src/video_core/host_shaders/astc_decoder.comp index d608678a3..bf2693559 100644 --- a/src/video_core/host_shaders/astc_decoder.comp +++ b/src/video_core/host_shaders/astc_decoder.comp @@ -125,7 +125,7 @@ uvec4 local_buff; uvec4 color_endpoint_data; int color_bitsread = 0; -// Four values, two endpoints, four maximum paritions +// Four values, two endpoints, four maximum partitions uint color_values[32]; int colvals_index = 0; diff --git a/src/video_core/host_shaders/opengl_smaa.glsl b/src/video_core/host_shaders/opengl_smaa.glsl index 3cbe87bbf..419f89bca 100644 --- a/src/video_core/host_shaders/opengl_smaa.glsl +++ b/src/video_core/host_shaders/opengl_smaa.glsl @@ -97,7 +97,7 @@ * half-rate linear filtering on GCN. * * If SMAA is applied to 64-bit color buffers, switching to point filtering - * when accesing them will increase the performance. Search for + * when accessing them will increase the performance. Search for * 'SMAASamplePoint' to see which textures may benefit from point * filtering, and where (which is basically the color input in the edge * detection and resolve passes). diff --git a/src/video_core/memory_manager.h b/src/video_core/memory_manager.h index cf56392ef..51ae2de68 100644 --- a/src/video_core/memory_manager.h +++ b/src/video_core/memory_manager.h @@ -103,8 +103,8 @@ public: /** * Returns a vector with all the subranges of cpu addresses mapped beneath. - * if the region is continous, a single pair will be returned. If it's unmapped, an empty vector - * will be returned; + * if the region is continuous, a single pair will be returned. 
If it's unmapped, an empty + * vector will be returned; */ std::vector<std::pair<GPUVAddr, std::size_t>> GetSubmappedRange(GPUVAddr gpu_addr, std::size_t size) const; diff --git a/src/video_core/query_cache.h b/src/video_core/query_cache.h index 00ce53e3e..8906ba6d8 100644 --- a/src/video_core/query_cache.h +++ b/src/video_core/query_cache.h @@ -341,7 +341,7 @@ public: /// Flushes the query to guest memory. virtual void Flush() { - // When counter is nullptr it means that it's just been reseted. We are supposed to write a + // When counter is nullptr it means that it's just been reset. We are supposed to write a // zero in these cases. const u64 value = counter ? counter->Query() : 0; std::memcpy(host_ptr, &value, sizeof(u64)); diff --git a/src/video_core/renderer_null/null_rasterizer.h b/src/video_core/renderer_null/null_rasterizer.h index 51f896e43..0c59e6a1f 100644 --- a/src/video_core/renderer_null/null_rasterizer.h +++ b/src/video_core/renderer_null/null_rasterizer.h @@ -22,6 +22,14 @@ public: explicit AccelerateDMA(); bool BufferCopy(GPUVAddr start_address, GPUVAddr end_address, u64 amount) override; bool BufferClear(GPUVAddr src_address, u64 amount, u32 value) override; + bool ImageToBuffer(const Tegra::DMA::ImageCopy& copy_info, const Tegra::DMA::ImageOperand& src, + const Tegra::DMA::BufferOperand& dst) override { + return false; + } + bool BufferToImage(const Tegra::DMA::ImageCopy& copy_info, const Tegra::DMA::BufferOperand& src, + const Tegra::DMA::ImageOperand& dst) override { + return false; + } }; class RasterizerNull final : public VideoCore::RasterizerAccelerated, diff --git a/src/video_core/renderer_opengl/blit_image.cpp b/src/video_core/renderer_opengl/blit_image.cpp index 9a560a73b..3b03e8d5a 100644 --- a/src/video_core/renderer_opengl/blit_image.cpp +++ b/src/video_core/renderer_opengl/blit_image.cpp @@ -22,7 +22,7 @@ BlitImageHelper::~BlitImageHelper() = default; void BlitImageHelper::BlitColor(GLuint dst_framebuffer, GLuint src_image_view, GLuint src_sampler, const Region2D& dst_region, const Region2D& src_region, const Extent3D& src_size) { - glEnable(GL_CULL_FACE); + glDisable(GL_CULL_FACE); glDisable(GL_COLOR_LOGIC_OP); glDisable(GL_DEPTH_TEST); glDisable(GL_STENCIL_TEST); @@ -31,7 +31,6 @@ void BlitImageHelper::BlitColor(GLuint dst_framebuffer, GLuint src_image_view, G glDisable(GL_ALPHA_TEST); glDisablei(GL_BLEND, 0); glPolygonMode(GL_FRONT_AND_BACK, GL_FILL); - glCullFace(GL_BACK); glFrontFace(GL_CW); glColorMaski(0, GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE); glDepthRangeIndexed(0, 0.0, 0.0); diff --git a/src/video_core/renderer_opengl/gl_fence_manager.cpp b/src/video_core/renderer_opengl/gl_fence_manager.cpp index 91463f854..5326172af 100644 --- a/src/video_core/renderer_opengl/gl_fence_manager.cpp +++ b/src/video_core/renderer_opengl/gl_fence_manager.cpp @@ -27,9 +27,7 @@ bool GLInnerFence::IsSignaled() const { return true; } ASSERT(sync_object.handle != 0); - GLint sync_status; - glGetSynciv(sync_object.handle, GL_SYNC_STATUS, 1, nullptr, &sync_status); - return sync_status == GL_SIGNALED; + return sync_object.IsSignaled(); } void GLInnerFence::Wait() { diff --git a/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp b/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp index 29491e762..89000d6e0 100644 --- a/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp +++ b/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp @@ -621,10 +621,7 @@ bool GraphicsPipeline::IsBuilt() noexcept { if (built_fence.handle == 0) { return false; } - // Timeout of zero 
means this is non-blocking - const auto sync_status = glClientWaitSync(built_fence.handle, 0, 0); - ASSERT(sync_status != GL_WAIT_FAILED); - is_built = sync_status != GL_TIMEOUT_EXPIRED; + is_built = built_fence.IsSignaled(); return is_built; } diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index 7bced675c..90e35e307 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -63,7 +63,7 @@ RasterizerOpenGL::RasterizerOpenGL(Core::Frontend::EmuWindow& emu_window_, Tegra buffer_cache(*this, cpu_memory_, buffer_cache_runtime), shader_cache(*this, emu_window_, device, texture_cache, buffer_cache, program_manager, state_tracker, gpu.ShaderNotify()), - query_cache(*this), accelerate_dma(buffer_cache), + query_cache(*this), accelerate_dma(buffer_cache, texture_cache), fence_manager(*this, gpu, texture_cache, buffer_cache, query_cache), blit_image(program_manager_) {} @@ -357,6 +357,7 @@ void RasterizerOpenGL::DrawTexture() { .y = static_cast<s32>(draw_texture_state.src_y1)}}; blit_image.BlitColor(texture_cache.GetFramebuffer()->Handle(), texture.DefaultHandle(), sampler->Handle(), dst_region, src_region, texture.size); + state_tracker.InvalidateState(); } ++num_queued_commands; @@ -576,7 +577,7 @@ bool RasterizerOpenGL::AccelerateConditionalRendering() { // Reimplement Host conditional rendering. return false; } - // Medium / Low Hack: stub any checks on queries writen into the buffer cache. + // Medium / Low Hack: stub any checks on queries written into the buffer cache. const GPUVAddr condition_address{maxwell3d->regs.render_enable.Address()}; Maxwell::ReportSemaphore::Compare cmp; if (gpu_memory->IsMemoryDirty(condition_address, sizeof(cmp), @@ -1262,7 +1263,8 @@ void RasterizerOpenGL::ReleaseChannel(s32 channel_id) { query_cache.EraseChannel(channel_id); } -AccelerateDMA::AccelerateDMA(BufferCache& buffer_cache_) : buffer_cache{buffer_cache_} {} +AccelerateDMA::AccelerateDMA(BufferCache& buffer_cache_, TextureCache& texture_cache_) + : buffer_cache{buffer_cache_}, texture_cache{texture_cache_} {} bool AccelerateDMA::BufferCopy(GPUVAddr src_address, GPUVAddr dest_address, u64 amount) { std::scoped_lock lock{buffer_cache.mutex}; @@ -1274,4 +1276,44 @@ bool AccelerateDMA::BufferClear(GPUVAddr src_address, u64 amount, u32 value) { return buffer_cache.DMAClear(src_address, amount, value); } +template <bool IS_IMAGE_UPLOAD> +bool AccelerateDMA::DmaBufferImageCopy(const Tegra::DMA::ImageCopy& copy_info, + const Tegra::DMA::BufferOperand& buffer_operand, + const Tegra::DMA::ImageOperand& image_operand) { + std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex}; + const auto image_id = texture_cache.DmaImageId(image_operand); + if (image_id == VideoCommon::NULL_IMAGE_ID) { + return false; + } + const u32 buffer_size = static_cast<u32>(buffer_operand.pitch * buffer_operand.height); + static constexpr auto sync_info = VideoCommon::ObtainBufferSynchronize::FullSynchronize; + const auto post_op = IS_IMAGE_UPLOAD ? 
VideoCommon::ObtainBufferOperation::DoNothing + : VideoCommon::ObtainBufferOperation::MarkAsWritten; + const auto [buffer, offset] = + buffer_cache.ObtainBuffer(buffer_operand.address, buffer_size, sync_info, post_op); + + const auto [image, copy] = texture_cache.DmaBufferImageCopy( + copy_info, buffer_operand, image_operand, image_id, IS_IMAGE_UPLOAD); + const std::span copy_span{&copy, 1}; + + if constexpr (IS_IMAGE_UPLOAD) { + image->UploadMemory(buffer->Handle(), offset, copy_span); + } else { + image->DownloadMemory(buffer->Handle(), offset, copy_span); + } + return true; +} + +bool AccelerateDMA::ImageToBuffer(const Tegra::DMA::ImageCopy& copy_info, + const Tegra::DMA::ImageOperand& image_operand, + const Tegra::DMA::BufferOperand& buffer_operand) { + return DmaBufferImageCopy<false>(copy_info, buffer_operand, image_operand); +} + +bool AccelerateDMA::BufferToImage(const Tegra::DMA::ImageCopy& copy_info, + const Tegra::DMA::BufferOperand& buffer_operand, + const Tegra::DMA::ImageOperand& image_operand) { + return DmaBufferImageCopy<true>(copy_info, buffer_operand, image_operand); +} + } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index 0c45832ae..ad6978bd0 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -50,14 +50,26 @@ static_assert(sizeof(BindlessSSBO) * CHAR_BIT == 128); class AccelerateDMA : public Tegra::Engines::AccelerateDMAInterface { public: - explicit AccelerateDMA(BufferCache& buffer_cache); + explicit AccelerateDMA(BufferCache& buffer_cache, TextureCache& texture_cache); bool BufferCopy(GPUVAddr src_address, GPUVAddr dest_address, u64 amount) override; bool BufferClear(GPUVAddr src_address, u64 amount, u32 value) override; + bool ImageToBuffer(const Tegra::DMA::ImageCopy& copy_info, const Tegra::DMA::ImageOperand& src, + const Tegra::DMA::BufferOperand& dst) override; + + bool BufferToImage(const Tegra::DMA::ImageCopy& copy_info, const Tegra::DMA::BufferOperand& src, + const Tegra::DMA::ImageOperand& dst) override; + private: + template <bool IS_IMAGE_UPLOAD> + bool DmaBufferImageCopy(const Tegra::DMA::ImageCopy& copy_info, + const Tegra::DMA::BufferOperand& src, + const Tegra::DMA::ImageOperand& dst); + BufferCache& buffer_cache; + TextureCache& texture_cache; }; class RasterizerOpenGL : public VideoCore::RasterizerAccelerated, @@ -150,7 +162,7 @@ private: /// Syncs the cull mode to match the guest state void SyncCullMode(); - /// Syncs the primitve restart to match the guest state + /// Syncs the primitive restart to match the guest state void SyncPrimitiveRestart(); /// Syncs the depth test state to match the guest state @@ -234,7 +246,7 @@ private: std::array<GLuint, MAX_TEXTURES> texture_handles{}; std::array<GLuint, MAX_IMAGES> image_handles{}; - /// Number of commands queued to the OpenGL driver. Resetted on flush. + /// Number of commands queued to the OpenGL driver. Reset on flush. 
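Both fence checks converted above (GLInnerFence::IsSignaled and GraphicsPipeline::IsBuilt) now share a single idiom, defined as OGLSync::IsSignaled in the gl_resource_manager hunk below. In isolation the poll looks like this; the standalone helper is hypothetical, and it tests the two GL success codes rather than the != GL_TIMEOUT_EXPIRED form the diff uses, which is equivalent once GL_WAIT_FAILED is asserted away.

bool IsFencePassed(GLsync sync) {
    // Zero flags and a zero timeout make this a non-blocking status poll.
    const GLenum status = glClientWaitSync(sync, 0, 0);
    // GL_ALREADY_SIGNALED: signaled before the call. GL_CONDITION_SATISFIED:
    // signaled during the zero-length wait. Anything else is not-ready
    // (GL_TIMEOUT_EXPIRED) or an error (GL_WAIT_FAILED).
    return status == GL_ALREADY_SIGNALED || status == GL_CONDITION_SATISFIED;
}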
size_t num_queued_commands = 0; bool has_written_global_memory = false; diff --git a/src/video_core/renderer_opengl/gl_resource_manager.cpp b/src/video_core/renderer_opengl/gl_resource_manager.cpp index 3a664fdec..eae8fd110 100644 --- a/src/video_core/renderer_opengl/gl_resource_manager.cpp +++ b/src/video_core/renderer_opengl/gl_resource_manager.cpp @@ -3,6 +3,7 @@ #include <string_view> #include <glad/glad.h> +#include "common/assert.h" #include "common/microprofile.h" #include "video_core/renderer_opengl/gl_resource_manager.h" #include "video_core/renderer_opengl/gl_shader_util.h" @@ -158,6 +159,15 @@ void OGLSync::Release() { handle = 0; } +bool OGLSync::IsSignaled() const noexcept { + // At least on Nvidia, glClientWaitSync with a timeout of 0 + // is faster than glGetSynciv of GL_SYNC_STATUS. + // Timeout of 0 means this check is non-blocking. + const auto sync_status = glClientWaitSync(handle, 0, 0); + ASSERT(sync_status != GL_WAIT_FAILED); + return sync_status != GL_TIMEOUT_EXPIRED; +} + void OGLFramebuffer::Create() { if (handle != 0) return; diff --git a/src/video_core/renderer_opengl/gl_resource_manager.h b/src/video_core/renderer_opengl/gl_resource_manager.h index bc05ba4bd..77362acd2 100644 --- a/src/video_core/renderer_opengl/gl_resource_manager.h +++ b/src/video_core/renderer_opengl/gl_resource_manager.h @@ -263,6 +263,9 @@ public: /// Deletes the internal OpenGL resource void Release(); + /// Checks if the sync has been signaled + bool IsSignaled() const noexcept; + GLsync handle = 0; }; diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index b047e7b3d..0b9c4a904 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -112,13 +112,17 @@ GLenum ImageTarget(Shader::TextureType type, int num_samples = 1) { return GL_NONE; } -GLenum TextureMode(PixelFormat format, bool is_first) { +GLenum TextureMode(PixelFormat format, std::array<SwizzleSource, 4> swizzle) { + bool any_r = + std::ranges::any_of(swizzle, [](SwizzleSource s) { return s == SwizzleSource::R; }); switch (format) { case PixelFormat::D24_UNORM_S8_UINT: case PixelFormat::D32_FLOAT_S8_UINT: - return is_first ? GL_DEPTH_COMPONENT : GL_STENCIL_INDEX; + // R = depth, G = stencil + return any_r ? GL_DEPTH_COMPONENT : GL_STENCIL_INDEX; case PixelFormat::S8_UINT_D24_UNORM: - return is_first ? GL_STENCIL_INDEX : GL_DEPTH_COMPONENT; + // R = stencil, G = depth + return any_r ? 
GL_STENCIL_INDEX : GL_DEPTH_COMPONENT; default: ASSERT(false); return GL_DEPTH_COMPONENT; @@ -208,8 +212,7 @@ void ApplySwizzle(GLuint handle, PixelFormat format, std::array<SwizzleSource, 4 case PixelFormat::D32_FLOAT_S8_UINT: case PixelFormat::S8_UINT_D24_UNORM: UNIMPLEMENTED_IF(swizzle[0] != SwizzleSource::R && swizzle[0] != SwizzleSource::G); - glTextureParameteri(handle, GL_DEPTH_STENCIL_TEXTURE_MODE, - TextureMode(format, swizzle[0] == SwizzleSource::R)); + glTextureParameteri(handle, GL_DEPTH_STENCIL_TEXTURE_MODE, TextureMode(format, swizzle)); std::ranges::transform(swizzle, swizzle.begin(), ConvertGreenRed); break; case PixelFormat::A5B5G5R1_UNORM: { @@ -714,9 +717,7 @@ std::optional<size_t> TextureCacheRuntime::StagingBuffers::FindBuffer(size_t req continue; } if (syncs[index].handle != 0) { - GLint status; - glGetSynciv(syncs[index].handle, GL_SYNC_STATUS, 1, nullptr, &status); - if (status != GL_SIGNALED) { + if (!syncs[index].IsSignaled()) { continue; } syncs[index].Release(); @@ -762,14 +763,14 @@ Image::Image(const VideoCommon::NullImageParams& params) : VideoCommon::ImageBas Image::~Image() = default; -void Image::UploadMemory(const ImageBufferMap& map, +void Image::UploadMemory(GLuint buffer_handle, size_t buffer_offset, std::span<const VideoCommon::BufferImageCopy> copies) { const bool is_rescaled = True(flags & ImageFlagBits::Rescaled); if (is_rescaled) { ScaleDown(true); } - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, map.buffer); - glFlushMappedBufferRange(GL_PIXEL_UNPACK_BUFFER, map.offset, unswizzled_size_bytes); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, buffer_handle); + glFlushMappedBufferRange(GL_PIXEL_UNPACK_BUFFER, buffer_offset, unswizzled_size_bytes); glPixelStorei(GL_UNPACK_ALIGNMENT, 1); @@ -788,21 +789,26 @@ void Image::UploadMemory(const ImageBufferMap& map, current_image_height = copy.buffer_image_height; glPixelStorei(GL_UNPACK_IMAGE_HEIGHT, current_image_height); } - CopyBufferToImage(copy, map.offset); + CopyBufferToImage(copy, buffer_offset); } if (is_rescaled) { ScaleUp(); } } -void Image::DownloadMemory(ImageBufferMap& map, +void Image::UploadMemory(const ImageBufferMap& map, + std::span<const VideoCommon::BufferImageCopy> copies) { + UploadMemory(map.buffer, map.offset, copies); +} + +void Image::DownloadMemory(GLuint buffer_handle, size_t buffer_offset, std::span<const VideoCommon::BufferImageCopy> copies) { const bool is_rescaled = True(flags & ImageFlagBits::Rescaled); if (is_rescaled) { ScaleDown(); } glMemoryBarrier(GL_PIXEL_BUFFER_BARRIER_BIT); // TODO: Move this to its own API - glBindBuffer(GL_PIXEL_PACK_BUFFER, map.buffer); + glBindBuffer(GL_PIXEL_PACK_BUFFER, buffer_handle); glPixelStorei(GL_PACK_ALIGNMENT, 1); u32 current_row_length = std::numeric_limits<u32>::max(); @@ -820,13 +826,18 @@ void Image::DownloadMemory(ImageBufferMap& map, current_image_height = copy.buffer_image_height; glPixelStorei(GL_PACK_IMAGE_HEIGHT, current_image_height); } - CopyImageToBuffer(copy, map.offset); + CopyImageToBuffer(copy, buffer_offset); } if (is_rescaled) { ScaleUp(true); } } +void Image::DownloadMemory(ImageBufferMap& map, + std::span<const VideoCommon::BufferImageCopy> copies) { + DownloadMemory(map.buffer, map.offset, copies); +} + GLuint Image::StorageHandle() noexcept { switch (info.format) { case PixelFormat::A8B8G8R8_SRGB: diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h index e30875496..911e4607a 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.h +++ 
b/src/video_core/renderer_opengl/gl_texture_cache.h @@ -206,9 +206,15 @@ public: Image(Image&&) = default; Image& operator=(Image&&) = default; + void UploadMemory(GLuint buffer_handle, size_t buffer_offset, + std::span<const VideoCommon::BufferImageCopy> copies); + void UploadMemory(const ImageBufferMap& map, std::span<const VideoCommon::BufferImageCopy> copies); + void DownloadMemory(GLuint buffer_handle, size_t buffer_offset, + std::span<const VideoCommon::BufferImageCopy> copies); + void DownloadMemory(ImageBufferMap& map, std::span<const VideoCommon::BufferImageCopy> copies); GLuint StorageHandle() noexcept; diff --git a/src/video_core/renderer_vulkan/fixed_pipeline_state.cpp b/src/video_core/renderer_vulkan/fixed_pipeline_state.cpp index f8398b511..e7df32d84 100644 --- a/src/video_core/renderer_vulkan/fixed_pipeline_state.cpp +++ b/src/video_core/renderer_vulkan/fixed_pipeline_state.cpp @@ -271,7 +271,7 @@ bool FixedPipelineState::operator==(const FixedPipelineState& rhs) const noexcep u32 FixedPipelineState::PackComparisonOp(Maxwell::ComparisonOp op) noexcept { // OpenGL enums go from 0x200 to 0x207 and the others from 1 to 8 - // If we substract 0x200 to OpenGL enums and 1 to the others we get a 0-7 range. + // If we subtract 0x200 to OpenGL enums and 1 to the others we get a 0-7 range. // Perfect for a hash. const u32 value = static_cast<u32>(op); return value - (value >= 0x200 ? 0x200 : 1); @@ -322,8 +322,8 @@ Maxwell::StencilOp::Op FixedPipelineState::UnpackStencilOp(u32 packed) noexcept } u32 FixedPipelineState::PackCullFace(Maxwell::CullFace cull) noexcept { - // FrontAndBack is 0x408, by substracting 0x406 in it we get 2. - // Individual cull faces are in 0x404 and 0x405, substracting 0x404 we get 0 and 1. + // FrontAndBack is 0x408, by subtracting 0x406 in it we get 2. + // Individual cull faces are in 0x404 and 0x405, subtracting 0x404 we get 0 and 1. const u32 value = static_cast<u32>(cull); return value - (value == 0x408 ? 
0x406 : 0x404); } diff --git a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp index ca52e2389..5dce51be8 100644 --- a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp +++ b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp @@ -166,7 +166,7 @@ struct FormatTuple { {VK_FORMAT_R16G16_UINT, Attachable | Storage}, // R16G16_UINT {VK_FORMAT_R16G16_SINT, Attachable | Storage}, // R16G16_SINT {VK_FORMAT_R16G16_SNORM, Attachable | Storage}, // R16G16_SNORM - {VK_FORMAT_UNDEFINED}, // R32G32B32_FLOAT + {VK_FORMAT_R32G32B32_SFLOAT}, // R32G32B32_FLOAT {VK_FORMAT_A8B8G8R8_SRGB_PACK32, Attachable}, // A8B8G8R8_SRGB {VK_FORMAT_R8G8_UNORM, Attachable | Storage}, // R8G8_UNORM {VK_FORMAT_R8G8_SNORM, Attachable | Storage}, // R8G8_SNORM @@ -234,11 +234,6 @@ FormatInfo SurfaceFormat(const Device& device, FormatType format_type, bool with PixelFormat pixel_format) { ASSERT(static_cast<size_t>(pixel_format) < std::size(tex_format_tuples)); FormatTuple tuple = tex_format_tuples[static_cast<size_t>(pixel_format)]; - if (tuple.format == VK_FORMAT_UNDEFINED) { - UNIMPLEMENTED_MSG("Unimplemented texture format with pixel format={}", pixel_format); - return FormatInfo{VK_FORMAT_A8B8G8R8_UNORM_PACK32, true, true}; - } - // Use A8B8G8R8_UNORM on hardware that doesn't support ASTC natively if (!device.IsOptimalAstcSupported() && VideoCore::Surface::IsPixelFormatASTC(pixel_format)) { const bool is_srgb = with_srgb && VideoCore::Surface::IsPixelFormatSRGB(pixel_format); diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp index b0153a502..9cbcb3c8f 100644 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp @@ -238,7 +238,7 @@ private: return indices; } - void MakeAndUpdateIndices(u8* staging_data, size_t quad_size, u32 quad, u32 first) { + void MakeAndUpdateIndices(u8* staging_data, size_t quad_size, u32 quad, u32 first) override { switch (index_type) { case VK_INDEX_TYPE_UINT8_EXT: std::memcpy(staging_data, MakeIndices<u8>(quad, first).data(), quad_size); @@ -278,7 +278,7 @@ private: return indices; } - void MakeAndUpdateIndices(u8* staging_data, size_t quad_size, u32 quad, u32 first) { + void MakeAndUpdateIndices(u8* staging_data, size_t quad_size, u32 quad, u32 first) override { switch (index_type) { case VK_INDEX_TYPE_UINT8_EXT: std::memcpy(staging_data, MakeIndices<u8>(quad, first).data(), quad_size); diff --git a/src/video_core/renderer_vulkan/vk_command_pool.cpp b/src/video_core/renderer_vulkan/vk_command_pool.cpp index 2f09de1c1..d0dbf7ca5 100644 --- a/src/video_core/renderer_vulkan/vk_command_pool.cpp +++ b/src/video_core/renderer_vulkan/vk_command_pool.cpp @@ -22,8 +22,8 @@ CommandPool::CommandPool(MasterSemaphore& master_semaphore_, const Device& devic CommandPool::~CommandPool() = default; void CommandPool::Allocate(size_t begin, size_t end) { - // Command buffers are going to be commited, recorded, executed every single usage cycle. - // They are also going to be reseted when commited. + // Command buffers are going to be committed, recorded, executed every single usage cycle. + // They are also going to be reset when committed. 
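Returning to the PackCullFace arithmetic documented a few hunks up: the magic numbers are the standard GL enum values, so the packing can be verified directly. Pack here is a hypothetical standalone copy of the function body, kept only to show the worked values.

constexpr u32 Pack(u32 value) {
    // GL_FRONT = 0x404, GL_BACK = 0x405, GL_FRONT_AND_BACK = 0x408.
    return value - (value == 0x408 ? 0x406 : 0x404);
}
static_assert(Pack(0x404) == 0); // GL_FRONT
static_assert(Pack(0x405) == 1); // GL_BACK
static_assert(Pack(0x408) == 2); // GL_FRONT_AND_BACK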
Pool& pool = pools.emplace_back(); pool.handle = device.GetLogical().CreateCommandPool({ .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO, diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index 719edbcfb..673ab478e 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -172,7 +172,7 @@ RasterizerVulkan::RasterizerVulkan(Core::Frontend::EmuWindow& emu_window_, Tegra buffer_cache(*this, cpu_memory_, buffer_cache_runtime), pipeline_cache(*this, device, scheduler, descriptor_pool, update_descriptor_queue, render_pass_cache, buffer_cache, texture_cache, gpu.ShaderNotify()), - query_cache{*this, device, scheduler}, accelerate_dma{buffer_cache}, + query_cache{*this, device, scheduler}, accelerate_dma(buffer_cache, texture_cache, scheduler), fence_manager(*this, gpu, texture_cache, buffer_cache, query_cache, device, scheduler), wfi_event(device.GetLogical().CreateEvent()) { scheduler.SetQueryCache(query_cache); @@ -671,7 +671,7 @@ bool RasterizerVulkan::AccelerateConditionalRendering() { // TODO(Blinkhawk): Reimplement Host conditional rendering. return false; } - // Medium / Low Hack: stub any checks on queries writen into the buffer cache. + // Medium / Low Hack: stub any checks on queries written into the buffer cache. const GPUVAddr condition_address{maxwell3d->regs.render_enable.Address()}; Maxwell::ReportSemaphore::Compare cmp; if (gpu_memory->IsMemoryDirty(condition_address, sizeof(cmp), @@ -756,7 +756,9 @@ void RasterizerVulkan::FlushWork() { draw_counter = 0; } -AccelerateDMA::AccelerateDMA(BufferCache& buffer_cache_) : buffer_cache{buffer_cache_} {} +AccelerateDMA::AccelerateDMA(BufferCache& buffer_cache_, TextureCache& texture_cache_, + Scheduler& scheduler_) + : buffer_cache{buffer_cache_}, texture_cache{texture_cache_}, scheduler{scheduler_} {} bool AccelerateDMA::BufferClear(GPUVAddr src_address, u64 amount, u32 value) { std::scoped_lock lock{buffer_cache.mutex}; @@ -768,6 +770,46 @@ bool AccelerateDMA::BufferCopy(GPUVAddr src_address, GPUVAddr dest_address, u64 return buffer_cache.DMACopy(src_address, dest_address, amount); } +template <bool IS_IMAGE_UPLOAD> +bool AccelerateDMA::DmaBufferImageCopy(const Tegra::DMA::ImageCopy& copy_info, + const Tegra::DMA::BufferOperand& buffer_operand, + const Tegra::DMA::ImageOperand& image_operand) { + std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex}; + const auto image_id = texture_cache.DmaImageId(image_operand); + if (image_id == VideoCommon::NULL_IMAGE_ID) { + return false; + } + const u32 buffer_size = static_cast<u32>(buffer_operand.pitch * buffer_operand.height); + static constexpr auto sync_info = VideoCommon::ObtainBufferSynchronize::FullSynchronize; + const auto post_op = IS_IMAGE_UPLOAD ? 
VideoCommon::ObtainBufferOperation::DoNothing + : VideoCommon::ObtainBufferOperation::MarkAsWritten; + const auto [buffer, offset] = + buffer_cache.ObtainBuffer(buffer_operand.address, buffer_size, sync_info, post_op); + + const auto [image, copy] = texture_cache.DmaBufferImageCopy( + copy_info, buffer_operand, image_operand, image_id, IS_IMAGE_UPLOAD); + const std::span copy_span{&copy, 1}; + + if constexpr (IS_IMAGE_UPLOAD) { + image->UploadMemory(buffer->Handle(), offset, copy_span); + } else { + image->DownloadMemory(buffer->Handle(), offset, copy_span); + } + return true; +} + +bool AccelerateDMA::ImageToBuffer(const Tegra::DMA::ImageCopy& copy_info, + const Tegra::DMA::ImageOperand& image_operand, + const Tegra::DMA::BufferOperand& buffer_operand) { + return DmaBufferImageCopy<false>(copy_info, buffer_operand, image_operand); +} + +bool AccelerateDMA::BufferToImage(const Tegra::DMA::ImageCopy& copy_info, + const Tegra::DMA::BufferOperand& buffer_operand, + const Tegra::DMA::ImageOperand& image_operand) { + return DmaBufferImageCopy<true>(copy_info, buffer_operand, image_operand); +} + void RasterizerVulkan::UpdateDynamicStates() { auto& regs = maxwell3d->regs; UpdateViewportsState(regs); @@ -1064,7 +1106,7 @@ void RasterizerVulkan::UpdateDepthBoundsTestEnable(Tegra::Engines::Maxwell3D::Re LOG_WARNING(Render_Vulkan, "Depth bounds is enabled but not supported"); enabled = false; } - scheduler.Record([enable = regs.depth_bounds_enable](vk::CommandBuffer cmdbuf) { + scheduler.Record([enable = enabled](vk::CommandBuffer cmdbuf) { cmdbuf.SetDepthBoundsTestEnableEXT(enable); }); } diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h index a0508b57c..1659fbc13 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.h +++ b/src/video_core/renderer_vulkan/vk_rasterizer.h @@ -45,14 +45,28 @@ class StateTracker; class AccelerateDMA : public Tegra::Engines::AccelerateDMAInterface { public: - explicit AccelerateDMA(BufferCache& buffer_cache); + explicit AccelerateDMA(BufferCache& buffer_cache, TextureCache& texture_cache, + Scheduler& scheduler); bool BufferCopy(GPUVAddr start_address, GPUVAddr end_address, u64 amount) override; bool BufferClear(GPUVAddr src_address, u64 amount, u32 value) override; + bool ImageToBuffer(const Tegra::DMA::ImageCopy& copy_info, const Tegra::DMA::ImageOperand& src, + const Tegra::DMA::BufferOperand& dst) override; + + bool BufferToImage(const Tegra::DMA::ImageCopy& copy_info, const Tegra::DMA::BufferOperand& src, + const Tegra::DMA::ImageOperand& dst) override; + private: + template <bool IS_IMAGE_UPLOAD> + bool DmaBufferImageCopy(const Tegra::DMA::ImageCopy& copy_info, + const Tegra::DMA::BufferOperand& src, + const Tegra::DMA::ImageOperand& dst); + BufferCache& buffer_cache; + TextureCache& texture_cache; + Scheduler& scheduler; }; class RasterizerVulkan final : public VideoCore::RasterizerAccelerated, diff --git a/src/video_core/renderer_vulkan/vk_resource_pool.cpp b/src/video_core/renderer_vulkan/vk_resource_pool.cpp index 6c8ac22f4..6572f82ba 100644 --- a/src/video_core/renderer_vulkan/vk_resource_pool.cpp +++ b/src/video_core/renderer_vulkan/vk_resource_pool.cpp @@ -37,7 +37,7 @@ size_t ResourcePool::CommitResource() { found = free_resource; } } - // Free iterator is hinted to the resource after the one that's been commited. + // Free iterator is hinted to the resource after the one that's been committed. 
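The hint that the corrected comment above describes is a wrap-around scan: searching starts at the slot after the last commit, and on success the hint moves past the slot just handed out. A standalone sketch of the pattern, with a hypothetical helper rather than the actual ResourcePool code:

#include <cstddef>
#include <optional>

template <typename Pred>
std::optional<std::size_t> HintedScan(std::size_t count, std::size_t& hint, Pred&& is_free) {
    for (std::size_t i = 0; i < count; ++i) {
        const std::size_t index = (hint + i) % count; // wrap around at most once
        if (is_free(index)) {
            hint = (index + 1) % count; // next search starts after this slot
            return index;
        }
    }
    return std::nullopt; // no free slot; the diff grows the pool in ManageOverflow
}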
hint_iterator = (*found + 1) % ticks.size(); return *found; } @@ -46,7 +46,7 @@ size_t ResourcePool::ManageOverflow() { const size_t old_capacity = ticks.size(); Grow(); - // The last entry is guaranted to be free, since it's the first element of the freshly + // The last entry is guaranteed to be free, since it's the first element of the freshly // allocated resources. return old_capacity; } diff --git a/src/video_core/renderer_vulkan/vk_scheduler.cpp b/src/video_core/renderer_vulkan/vk_scheduler.cpp index e03685af1..c636a1625 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.cpp +++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp @@ -47,14 +47,15 @@ Scheduler::Scheduler(const Device& device_, StateTracker& state_tracker_) Scheduler::~Scheduler() = default; void Scheduler::Flush(VkSemaphore signal_semaphore, VkSemaphore wait_semaphore) { + // When flushing, we only send data to the worker thread; no waiting is necessary. SubmitExecution(signal_semaphore, wait_semaphore); AllocateNewContext(); } void Scheduler::Finish(VkSemaphore signal_semaphore, VkSemaphore wait_semaphore) { + // When finishing, we need to wait for the submission to have executed on the device. const u64 presubmit_tick = CurrentTick(); SubmitExecution(signal_semaphore, wait_semaphore); - WaitWorker(); Wait(presubmit_tick); AllocateNewContext(); } @@ -63,8 +64,13 @@ void Scheduler::WaitWorker() { MICROPROFILE_SCOPE(Vulkan_WaitForWorker); DispatchWork(); - std::unique_lock lock{work_mutex}; - wait_cv.wait(lock, [this] { return work_queue.empty(); }); + // Ensure the queue is drained. + std::unique_lock ql{queue_mutex}; + event_cv.wait(ql, [this] { return work_queue.empty(); }); + + // Now wait for execution to finish. + // This needs to be done in the same order as WorkerThread. + std::unique_lock el{execution_mutex}; } void Scheduler::DispatchWork() { @@ -72,10 +78,10 @@ void Scheduler::DispatchWork() { return; } { - std::scoped_lock lock{work_mutex}; + std::scoped_lock ql{queue_mutex}; work_queue.push(std::move(chunk)); } - work_cv.notify_one(); + event_cv.notify_all(); AcquireNewChunk(); } @@ -137,30 +143,55 @@ bool Scheduler::UpdateRescaling(bool is_rescaling) { void Scheduler::WorkerThread(std::stop_token stop_token) { Common::SetCurrentThreadName("VulkanWorker"); - do { + + const auto TryPopQueue{[this](auto& work) -> bool { + if (work_queue.empty()) { + return false; + } + + work = std::move(work_queue.front()); + work_queue.pop(); + event_cv.notify_all(); + return true; + }}; + + while (!stop_token.stop_requested()) { std::unique_ptr<CommandChunk> work; - bool has_submit{false}; + { - std::unique_lock lock{work_mutex}; - if (work_queue.empty()) { - wait_cv.notify_all(); - } - Common::CondvarWait(work_cv, lock, stop_token, [&] { return !work_queue.empty(); }); + std::unique_lock lk{queue_mutex}; + + // Wait for work. + Common::CondvarWait(event_cv, lk, stop_token, [&] { return TryPopQueue(work); }); + + // If we've been asked to stop, we're done. if (stop_token.stop_requested()) { - continue; + return; } - work = std::move(work_queue.front()); - work_queue.pop(); - has_submit = work->HasSubmit(); + // Exchange lock ownership so that we take the execution lock before + // the queue lock goes out of scope. This allows us to force execution + // to complete in the next step. + std::exchange(lk, std::unique_lock{execution_mutex}); + + // Perform the work, tracking whether the chunk was a submission + // before executing. 
+ const bool has_submit = work->HasSubmit(); work->ExecuteAll(current_cmdbuf); + + // If the chunk was a submission, reallocate the command buffer. + if (has_submit) { + AllocateWorkerCommandBuffer(); + } } - if (has_submit) { - AllocateWorkerCommandBuffer(); + + { + std::scoped_lock rl{reserve_mutex}; + + // Recycle the chunk back to the reserve. + chunk_reserve.emplace_back(std::move(work)); } - std::scoped_lock reserve_lock{reserve_mutex}; - chunk_reserve.push_back(std::move(work)); - } while (!stop_token.stop_requested()); + } } void Scheduler::AllocateWorkerCommandBuffer() { @@ -289,13 +320,16 @@ void Scheduler::EndRenderPass() { } void Scheduler::AcquireNewChunk() { - std::scoped_lock lock{reserve_mutex}; + std::scoped_lock rl{reserve_mutex}; + if (chunk_reserve.empty()) { + // If we don't have anything reserved, we need to make a new chunk. chunk = std::make_unique<CommandChunk>(); - return; + } else { + // Otherwise, we can just take from the reserve. + chunk = std::move(chunk_reserve.back()); + chunk_reserve.pop_back(); } - chunk = std::move(chunk_reserve.back()); - chunk_reserve.pop_back(); } } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_scheduler.h b/src/video_core/renderer_vulkan/vk_scheduler.h index bd4cb0f7e..8d75ce987 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.h +++ b/src/video_core/renderer_vulkan/vk_scheduler.h @@ -232,10 +232,10 @@ private: std::queue<std::unique_ptr<CommandChunk>> work_queue; std::vector<std::unique_ptr<CommandChunk>> chunk_reserve; + std::mutex execution_mutex; std::mutex reserve_mutex; - std::mutex work_mutex; - std::condition_variable_any work_cv; - std::condition_variable wait_cv; + std::mutex queue_mutex; + std::condition_variable_any event_cv; std::jthread worker_thread; }; diff --git a/src/video_core/renderer_vulkan/vk_swapchain.cpp b/src/video_core/renderer_vulkan/vk_swapchain.cpp index b6810eef9..85fdce6e5 100644 --- a/src/video_core/renderer_vulkan/vk_swapchain.cpp +++ b/src/video_core/renderer_vulkan/vk_swapchain.cpp @@ -159,7 +159,7 @@ void Swapchain::CreateSwapchain(const VkSurfaceCapabilitiesKHR& capabilities, bo present_mode = ChooseSwapPresentMode(present_modes); u32 requested_image_count{capabilities.minImageCount + 1}; - // Ensure Tripple buffering if possible. + // Ensure Triple buffering if possible. if (capabilities.maxImageCount > 0) { if (requested_image_count > capabilities.maxImageCount) { requested_image_count = capabilities.maxImageCount; } diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index 80adb70eb..ae15f6976 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -189,13 +189,16 @@ constexpr VkBorderColor ConvertBorderColor(const std::array<float, 4>& color) { if (info.IsRenderTarget()) { return ImageAspectMask(info.format); } - const bool is_first = info.Swizzle()[0] == SwizzleSource::R; + bool any_r = + std::ranges::any_of(info.Swizzle(), [](SwizzleSource s) { return s == SwizzleSource::R; }); switch (info.format) { case PixelFormat::D24_UNORM_S8_UINT: case PixelFormat::D32_FLOAT_S8_UINT: - return is_first ? VK_IMAGE_ASPECT_DEPTH_BIT : VK_IMAGE_ASPECT_STENCIL_BIT; + // R = depth, G = stencil + return any_r ? VK_IMAGE_ASPECT_DEPTH_BIT : VK_IMAGE_ASPECT_STENCIL_BIT; case PixelFormat::S8_UINT_D24_UNORM: - return is_first ? VK_IMAGE_ASPECT_STENCIL_BIT : VK_IMAGE_ASPECT_DEPTH_BIT; + // R = stencil, G = depth + return any_r ? 
VK_IMAGE_ASPECT_STENCIL_BIT : VK_IMAGE_ASPECT_DEPTH_BIT; case PixelFormat::D16_UNORM: case PixelFormat::D32_FLOAT: return VK_IMAGE_ASPECT_DEPTH_BIT; @@ -864,13 +867,19 @@ void TextureCacheRuntime::ReinterpretImage(Image& dst, Image& src, const VkImageAspectFlags src_aspect_mask = src.AspectMask(); const VkImageAspectFlags dst_aspect_mask = dst.AspectMask(); - std::ranges::transform(copies, vk_in_copies.begin(), [src_aspect_mask](const auto& copy) { - return MakeBufferImageCopy(copy, true, src_aspect_mask); - }); + const auto bpp_in = BytesPerBlock(src.info.format) / DefaultBlockWidth(src.info.format); + const auto bpp_out = BytesPerBlock(dst.info.format) / DefaultBlockWidth(dst.info.format); + std::ranges::transform(copies, vk_in_copies.begin(), + [src_aspect_mask, bpp_in, bpp_out](const auto& copy) { + auto copy2 = copy; + copy2.src_offset.x = (bpp_out * copy.src_offset.x) / bpp_in; + copy2.extent.width = (bpp_out * copy.extent.width) / bpp_in; + return MakeBufferImageCopy(copy2, true, src_aspect_mask); + }); std::ranges::transform(copies, vk_out_copies.begin(), [dst_aspect_mask](const auto& copy) { return MakeBufferImageCopy(copy, false, dst_aspect_mask); }); - const u32 img_bpp = BytesPerBlock(src.info.format); + const u32 img_bpp = BytesPerBlock(dst.info.format); size_t total_size = 0; for (const auto& copy : copies) { total_size += copy.extent.width * copy.extent.height * copy.extent.depth * img_bpp; @@ -1306,15 +1315,16 @@ Image::Image(const VideoCommon::NullImageParams& params) : VideoCommon::ImageBas Image::~Image() = default; -void Image::UploadMemory(const StagingBufferRef& map, std::span<const BufferImageCopy> copies) { +void Image::UploadMemory(VkBuffer buffer, VkDeviceSize offset, + std::span<const VideoCommon::BufferImageCopy> copies) { // TODO: Move this to another API const bool is_rescaled = True(flags & ImageFlagBits::Rescaled); if (is_rescaled) { ScaleDown(true); } scheduler->RequestOutsideRenderPassOperationContext(); - std::vector vk_copies = TransformBufferImageCopies(copies, map.offset, aspect_mask); - const VkBuffer src_buffer = map.buffer; + std::vector vk_copies = TransformBufferImageCopies(copies, offset, aspect_mask); + const VkBuffer src_buffer = buffer; const VkImage vk_image = *original_image; const VkImageAspectFlags vk_aspect_mask = aspect_mask; const bool is_initialized = std::exchange(initialized, true); @@ -1327,14 +1337,19 @@ void Image::UploadMemory(const StagingBufferRef& map, std::span<const BufferImag } } -void Image::DownloadMemory(const StagingBufferRef& map, std::span<const BufferImageCopy> copies) { +void Image::UploadMemory(const StagingBufferRef& map, std::span<const BufferImageCopy> copies) { + UploadMemory(map.buffer, map.offset, copies); +} + +void Image::DownloadMemory(VkBuffer buffer, VkDeviceSize offset, + std::span<const VideoCommon::BufferImageCopy> copies) { const bool is_rescaled = True(flags & ImageFlagBits::Rescaled); if (is_rescaled) { ScaleDown(); } - std::vector vk_copies = TransformBufferImageCopies(copies, map.offset, aspect_mask); + std::vector vk_copies = TransformBufferImageCopies(copies, offset, aspect_mask); scheduler->RequestOutsideRenderPassOperationContext(); - scheduler->Record([buffer = map.buffer, image = *original_image, aspect_mask = aspect_mask, + scheduler->Record([buffer, image = *original_image, aspect_mask = aspect_mask, vk_copies](vk::CommandBuffer cmdbuf) { const VkImageMemoryBarrier read_barrier{ .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, @@ -1389,6 +1404,10 @@ void Image::DownloadMemory(const 
diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.h b/src/video_core/renderer_vulkan/vk_texture_cache.h
index 0ce39616f..d5ee23f8d 100644
--- a/src/video_core/renderer_vulkan/vk_texture_cache.h
+++ b/src/video_core/renderer_vulkan/vk_texture_cache.h
@@ -132,9 +132,15 @@ public:
     Image(Image&&) = default;
     Image& operator=(Image&&) = default;
 
+    void UploadMemory(VkBuffer buffer, VkDeviceSize offset,
+                      std::span<const VideoCommon::BufferImageCopy> copies);
+
     void UploadMemory(const StagingBufferRef& map,
                       std::span<const VideoCommon::BufferImageCopy> copies);
 
+    void DownloadMemory(VkBuffer buffer, VkDeviceSize offset,
+                        std::span<const VideoCommon::BufferImageCopy> copies);
+
     void DownloadMemory(const StagingBufferRef& map,
                         std::span<const VideoCommon::BufferImageCopy> copies);
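
The header now pairs each StagingBufferRef entry point with a raw VkBuffer/offset overload, and the staging variant becomes a thin forwarder. A simplified non-Vulkan sketch of the overload shape (all names illustrative):

```cpp
#include <cstdint>
#include <span>

struct BufferRef {
    std::uint64_t buffer; // opaque handle, standing in for VkBuffer
    std::uint64_t offset;
};

struct CopyRegion {};

// Primitive entry point: any buffer handle plus an explicit offset.
void Upload(std::uint64_t buffer, std::uint64_t offset, std::span<const CopyRegion> copies) {
    // record the copy commands here
    (void)buffer;
    (void)offset;
    (void)copies;
}

// Convenience overload: unpack the staging reference and forward.
void Upload(const BufferRef& ref, std::span<const CopyRegion> copies) {
    Upload(ref.buffer, ref.offset, copies);
}
```

Keeping the raw-handle overload as the primitive lets callers that already own a buffer, such as the new DMA path, bypass the staging pool entirely.
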
diff --git a/src/video_core/renderer_vulkan/vk_update_descriptor.cpp b/src/video_core/renderer_vulkan/vk_update_descriptor.cpp
index 4d4a6753b..009dab0b6 100644
--- a/src/video_core/renderer_vulkan/vk_update_descriptor.cpp
+++ b/src/video_core/renderer_vulkan/vk_update_descriptor.cpp
@@ -25,7 +25,7 @@ void UpdateDescriptorQueue::TickFrame() {
 
 void UpdateDescriptorQueue::Acquire() {
     // Minimum number of entries required.
-    // This is the maximum number of entries a single draw call migth use.
+    // This is the maximum number of entries a single draw call might use.
     static constexpr size_t MIN_ENTRIES = 0x400;
 
     if (std::distance(payload.data(), payload_cursor) + MIN_ENTRIES >= payload.max_size()) {
diff --git a/src/video_core/texture_cache/format_lookup_table.cpp b/src/video_core/texture_cache/format_lookup_table.cpp
index 08aa8ca33..5fc2b2fec 100644
--- a/src/video_core/texture_cache/format_lookup_table.cpp
+++ b/src/video_core/texture_cache/format_lookup_table.cpp
@@ -42,15 +42,15 @@ PixelFormat PixelFormatFromTextureInfo(TextureFormat format, ComponentType red,
                                        ComponentType blue, ComponentType alpha,
                                        bool is_srgb) noexcept {
     switch (Hash(format, red, green, blue, alpha, is_srgb)) {
-    case Hash(TextureFormat::A8R8G8B8, UNORM):
+    case Hash(TextureFormat::A8B8G8R8, UNORM):
         return PixelFormat::A8B8G8R8_UNORM;
-    case Hash(TextureFormat::A8R8G8B8, SNORM):
+    case Hash(TextureFormat::A8B8G8R8, SNORM):
         return PixelFormat::A8B8G8R8_SNORM;
-    case Hash(TextureFormat::A8R8G8B8, UINT):
+    case Hash(TextureFormat::A8B8G8R8, UINT):
         return PixelFormat::A8B8G8R8_UINT;
-    case Hash(TextureFormat::A8R8G8B8, SINT):
+    case Hash(TextureFormat::A8B8G8R8, SINT):
         return PixelFormat::A8B8G8R8_SINT;
-    case Hash(TextureFormat::A8R8G8B8, UNORM, SRGB):
+    case Hash(TextureFormat::A8B8G8R8, UNORM, SRGB):
         return PixelFormat::A8B8G8R8_SRGB;
     case Hash(TextureFormat::B5G6R5, UNORM):
         return PixelFormat::B5G6R5_UNORM;
@@ -74,13 +74,13 @@ PixelFormat PixelFormatFromTextureInfo(TextureFormat format, ComponentType red,
         return PixelFormat::R8_UINT;
     case Hash(TextureFormat::R8, SINT):
         return PixelFormat::R8_SINT;
-    case Hash(TextureFormat::R8G8, UNORM):
+    case Hash(TextureFormat::G8R8, UNORM):
         return PixelFormat::R8G8_UNORM;
-    case Hash(TextureFormat::R8G8, SNORM):
+    case Hash(TextureFormat::G8R8, SNORM):
         return PixelFormat::R8G8_SNORM;
-    case Hash(TextureFormat::R8G8, UINT):
+    case Hash(TextureFormat::G8R8, UINT):
         return PixelFormat::R8G8_UINT;
-    case Hash(TextureFormat::R8G8, SINT):
+    case Hash(TextureFormat::G8R8, SINT):
         return PixelFormat::R8G8_SINT;
     case Hash(TextureFormat::R16G16B16A16, FLOAT):
         return PixelFormat::R16G16B16A16_FLOAT;
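
Dispatch in this lookup table relies on Hash packing every field of the query into one integer so each case label stays a compile-time constant. A self-contained sketch of the idea; the field widths below are illustrative, not the table's actual packing:

```cpp
#include <cstdint>

enum class Format : std::uint64_t { A8B8G8R8 = 0x08, G8R8 = 0x18 };
enum class Component : std::uint64_t { SNORM = 1, UNORM = 2, SINT = 3, UINT = 4 };

// Pack all lookup fields into one integer so a switch can dispatch on it.
constexpr std::uint64_t Hash(Format fmt, Component r, Component g, Component b, Component a,
                             bool srgb) {
    std::uint64_t key = static_cast<std::uint64_t>(fmt);
    key |= static_cast<std::uint64_t>(r) << 8;
    key |= static_cast<std::uint64_t>(g) << 11;
    key |= static_cast<std::uint64_t>(b) << 14;
    key |= static_cast<std::uint64_t>(a) << 17;
    key |= static_cast<std::uint64_t>(srgb) << 20;
    return key;
}

// Distinct component types must yield distinct keys for the switch to work.
static_assert(Hash(Format::G8R8, Component::UNORM, Component::UNORM, Component::UNORM,
                   Component::UNORM, false) !=
              Hash(Format::G8R8, Component::SNORM, Component::SNORM, Component::SNORM,
                   Component::SNORM, false));
```
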
@@ -136,49 +136,49 @@ PixelFormat PixelFormatFromTextureInfo(TextureFormat format, ComponentType red,
         return PixelFormat::R32_SINT;
     case Hash(TextureFormat::E5B9G9R9, FLOAT):
         return PixelFormat::E5B9G9R9_FLOAT;
-    case Hash(TextureFormat::D32, FLOAT):
+    case Hash(TextureFormat::Z32, FLOAT):
         return PixelFormat::D32_FLOAT;
-    case Hash(TextureFormat::D16, UNORM):
+    case Hash(TextureFormat::Z16, UNORM):
         return PixelFormat::D16_UNORM;
-    case Hash(TextureFormat::S8D24, UINT, UNORM, UNORM, UNORM, LINEAR):
+    case Hash(TextureFormat::Z24S8, UINT, UNORM, UNORM, UNORM, LINEAR):
         return PixelFormat::S8_UINT_D24_UNORM;
-    case Hash(TextureFormat::S8D24, UINT, UNORM, UINT, UINT, LINEAR):
+    case Hash(TextureFormat::Z24S8, UINT, UNORM, UINT, UINT, LINEAR):
         return PixelFormat::S8_UINT_D24_UNORM;
-    case Hash(TextureFormat::R8G24, UINT, UNORM, UNORM, UNORM, LINEAR):
+    case Hash(TextureFormat::G24R8, UINT, UNORM, UNORM, UNORM, LINEAR):
         return PixelFormat::S8_UINT_D24_UNORM;
-    case Hash(TextureFormat::D24S8, UNORM, UINT, UINT, UINT, LINEAR):
+    case Hash(TextureFormat::S8Z24, UNORM, UINT, UINT, UINT, LINEAR):
         return PixelFormat::D24_UNORM_S8_UINT;
-    case Hash(TextureFormat::D32S8, FLOAT, UINT, UNORM, UNORM, LINEAR):
+    case Hash(TextureFormat::Z32_X24S8, FLOAT, UINT, UNORM, UNORM, LINEAR):
         return PixelFormat::D32_FLOAT_S8_UINT;
-    case Hash(TextureFormat::R32_B24G8, FLOAT, UINT, UNORM, UNORM, LINEAR):
+    case Hash(TextureFormat::R32B24G8, FLOAT, UINT, UNORM, UNORM, LINEAR):
         return PixelFormat::D32_FLOAT_S8_UINT;
-    case Hash(TextureFormat::BC1_RGBA, UNORM, LINEAR):
+    case Hash(TextureFormat::DXT1, UNORM, LINEAR):
         return PixelFormat::BC1_RGBA_UNORM;
-    case Hash(TextureFormat::BC1_RGBA, UNORM, SRGB):
+    case Hash(TextureFormat::DXT1, UNORM, SRGB):
         return PixelFormat::BC1_RGBA_SRGB;
-    case Hash(TextureFormat::BC2, UNORM, LINEAR):
+    case Hash(TextureFormat::DXT23, UNORM, LINEAR):
         return PixelFormat::BC2_UNORM;
-    case Hash(TextureFormat::BC2, UNORM, SRGB):
+    case Hash(TextureFormat::DXT23, UNORM, SRGB):
         return PixelFormat::BC2_SRGB;
-    case Hash(TextureFormat::BC3, UNORM, LINEAR):
+    case Hash(TextureFormat::DXT45, UNORM, LINEAR):
         return PixelFormat::BC3_UNORM;
-    case Hash(TextureFormat::BC3, UNORM, SRGB):
+    case Hash(TextureFormat::DXT45, UNORM, SRGB):
         return PixelFormat::BC3_SRGB;
-    case Hash(TextureFormat::BC4, UNORM):
+    case Hash(TextureFormat::DXN1, UNORM):
         return PixelFormat::BC4_UNORM;
-    case Hash(TextureFormat::BC4, SNORM):
+    case Hash(TextureFormat::DXN1, SNORM):
         return PixelFormat::BC4_SNORM;
-    case Hash(TextureFormat::BC5, UNORM):
+    case Hash(TextureFormat::DXN2, UNORM):
         return PixelFormat::BC5_UNORM;
-    case Hash(TextureFormat::BC5, SNORM):
+    case Hash(TextureFormat::DXN2, SNORM):
         return PixelFormat::BC5_SNORM;
-    case Hash(TextureFormat::BC7, UNORM, LINEAR):
+    case Hash(TextureFormat::BC7U, UNORM, LINEAR):
         return PixelFormat::BC7_UNORM;
-    case Hash(TextureFormat::BC7, UNORM, SRGB):
+    case Hash(TextureFormat::BC7U, UNORM, SRGB):
         return PixelFormat::BC7_SRGB;
-    case Hash(TextureFormat::BC6H_SFLOAT, FLOAT):
+    case Hash(TextureFormat::BC6H_S16, FLOAT):
         return PixelFormat::BC6H_SFLOAT;
-    case Hash(TextureFormat::BC6H_UFLOAT, FLOAT):
+    case Hash(TextureFormat::BC6H_U16, FLOAT):
         return PixelFormat::BC6H_UFLOAT;
     case Hash(TextureFormat::ASTC_2D_4X4, UNORM, LINEAR):
         return PixelFormat::ASTC_2D_4X4_UNORM;
diff --git a/src/video_core/texture_cache/image_base.h b/src/video_core/texture_cache/image_base.h
index e8fa592d2..329396bb6 100644
--- a/src/video_core/texture_cache/image_base.h
+++ b/src/video_core/texture_cache/image_base.h
@@ -25,7 +25,7 @@ enum class ImageFlagBits : u32 {
     Registered = 1 << 6, ///< True when the image is registered
     Picked = 1 << 7,     ///< Temporary flag to mark the image as picked
     Remapped = 1 << 8,   ///< Image has been remapped.
-    Sparse = 1 << 9,     ///< Image has non continous submemory.
+    Sparse = 1 << 9,     ///< Image has non continuous submemory.
 
     // Garbage Collection Flags
     BadOverlap = 1 << 10, ///< This image overlaps other but doesn't fit, has higher
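
ImageFlagBits is a bitmask: each flag occupies one bit so membership tests compile to a single AND. A minimal sketch of the pattern, with local stand-ins for yuzu's flag helpers:

```cpp
#include <cstdint>

enum class ImageFlags : std::uint32_t {
    GpuModified = 1u << 0,
    Sparse = 1u << 1,
};

constexpr ImageFlags operator&(ImageFlags a, ImageFlags b) {
    return static_cast<ImageFlags>(static_cast<std::uint32_t>(a) & static_cast<std::uint32_t>(b));
}
constexpr ImageFlags operator|(ImageFlags a, ImageFlags b) {
    return static_cast<ImageFlags>(static_cast<std::uint32_t>(a) | static_cast<std::uint32_t>(b));
}

// yuzu spells this helper True(); a local stand-in is used here.
constexpr bool Any(ImageFlags f) {
    return static_cast<std::uint32_t>(f) != 0;
}

static_assert(Any((ImageFlags::GpuModified | ImageFlags::Sparse) & ImageFlags::Sparse));
```
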
diff --git a/src/video_core/texture_cache/image_info.cpp b/src/video_core/texture_cache/image_info.cpp
index e9100091e..a1296b574 100644
--- a/src/video_core/texture_cache/image_info.cpp
+++ b/src/video_core/texture_cache/image_info.cpp
@@ -216,10 +216,51 @@ ImageInfo::ImageInfo(const Tegra::Engines::Fermi2D::Surface& config) noexcept {
             .height = config.height,
             .depth = 1,
         };
-        rescaleable = block.depth == 0;
-        rescaleable &= size.height > 256;
+        rescaleable = block.depth == 0 && size.height > 256;
         downscaleable = size.height > 512;
     }
 }
 
+static PixelFormat ByteSizeToFormat(u32 bytes_per_pixel) {
+    switch (bytes_per_pixel) {
+    case 1:
+        return PixelFormat::R8_UINT;
+    case 2:
+        return PixelFormat::R8G8_UINT;
+    case 4:
+        return PixelFormat::A8B8G8R8_UINT;
+    case 8:
+        return PixelFormat::R16G16B16A16_UINT;
+    case 16:
+        return PixelFormat::R32G32B32A32_UINT;
+    default:
+        UNIMPLEMENTED();
+        return PixelFormat::Invalid;
+    }
+}
+
+ImageInfo::ImageInfo(const Tegra::DMA::ImageOperand& config) noexcept {
+    const u32 bytes_per_pixel = config.bytes_per_pixel;
+    format = ByteSizeToFormat(bytes_per_pixel);
+    type = config.params.block_size.depth > 0 ? ImageType::e3D : ImageType::e2D;
+    num_samples = 1;
+    block = Extent3D{
+        .width = config.params.block_size.width,
+        .height = config.params.block_size.height,
+        .depth = config.params.block_size.depth,
+    };
+    size = Extent3D{
+        .width = config.params.width,
+        .height = config.params.height,
+        .depth = config.params.depth,
+    };
+    tile_width_spacing = 0;
+    resources.levels = 1;
+    resources.layers = 1;
+    layer_stride = CalculateLayerStride(*this);
+    maybe_unaligned_layer_stride = CalculateLayerSize(*this);
+    rescaleable = block.depth == 0 && size.height > 256;
+    downscaleable = size.height > 512;
+}
+
 } // namespace VideoCommon
diff --git a/src/video_core/texture_cache/image_info.h b/src/video_core/texture_cache/image_info.h
index 93755e15e..a12f5b44f 100644
--- a/src/video_core/texture_cache/image_info.h
+++ b/src/video_core/texture_cache/image_info.h
@@ -5,6 +5,7 @@
 
 #include "video_core/engines/fermi_2d.h"
 #include "video_core/engines/maxwell_3d.h"
+#include "video_core/engines/maxwell_dma.h"
 #include "video_core/surface.h"
 #include "video_core/texture_cache/types.h"
 
@@ -19,6 +20,7 @@ struct ImageInfo {
     explicit ImageInfo(const Tegra::Engines::Maxwell3D::Regs& regs, size_t index) noexcept;
     explicit ImageInfo(const Tegra::Engines::Maxwell3D::Regs& regs) noexcept;
     explicit ImageInfo(const Tegra::Engines::Fermi2D::Surface& config) noexcept;
+    explicit ImageInfo(const Tegra::DMA::ImageOperand& config) noexcept;
 
     PixelFormat format = PixelFormat::Invalid;
     ImageType type = ImageType::e1D;
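
A DMA operand carries only a pixel stride, no component layout, so the constructor above fabricates a format purely from bytes per pixel; any UINT format of the right size works because the copy is bit-exact. A sketch with a local enum standing in for PixelFormat:

```cpp
#include <cstdint>

enum class Fmt { Invalid, R8_UINT, R8G8_UINT, A8B8G8R8_UINT, R16G16B16A16_UINT, R32G32B32A32_UINT };

// Pick any UINT format whose block size matches the DMA stride; the data is
// moved bit-for-bit either way, so the component split is irrelevant.
constexpr Fmt ByteSizeToFormat(std::uint32_t bytes_per_pixel) {
    switch (bytes_per_pixel) {
    case 1:
        return Fmt::R8_UINT;
    case 2:
        return Fmt::R8G8_UINT;
    case 4:
        return Fmt::A8B8G8R8_UINT;
    case 8:
        return Fmt::R16G16B16A16_UINT;
    case 16:
        return Fmt::R32G32B32A32_UINT;
    default:
        return Fmt::Invalid;
    }
}

static_assert(ByteSizeToFormat(4) == Fmt::A8B8G8R8_UINT);
```
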
diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h
index 9dd152fbe..8e8b9a5e6 100644
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@@ -745,6 +745,25 @@ void TextureCache<P>::PopAsyncFlushes() {
 }
 
 template <class P>
+ImageId TextureCache<P>::DmaImageId(const Tegra::DMA::ImageOperand& operand) {
+    const ImageInfo dst_info(operand);
+    const ImageId dst_id = FindDMAImage(dst_info, operand.address);
+    if (!dst_id) {
+        return NULL_IMAGE_ID;
+    }
+    const auto& image = slot_images[dst_id];
+    if (False(image.flags & ImageFlagBits::GpuModified)) {
+        // No need to waste time on an image that's synced with guest
+        return NULL_IMAGE_ID;
+    }
+    const auto base = image.TryFindBase(operand.address);
+    if (!base) {
+        return NULL_IMAGE_ID;
+    }
+    return dst_id;
+}
+
+template <class P>
 bool TextureCache<P>::IsRescaling() const noexcept {
     return is_rescaling;
 }
@@ -772,6 +791,49 @@ bool TextureCache<P>::IsRegionGpuModified(VAddr addr, size_t size) {
 }
 
 template <class P>
+std::pair<typename TextureCache<P>::Image*, BufferImageCopy> TextureCache<P>::DmaBufferImageCopy(
+    const Tegra::DMA::ImageCopy& copy_info, const Tegra::DMA::BufferOperand& buffer_operand,
+    const Tegra::DMA::ImageOperand& image_operand, ImageId image_id, bool modifies_image) {
+    const auto [level, base] = PrepareDmaImage(image_id, image_operand.address, modifies_image);
+    auto* image = &slot_images[image_id];
+    const u32 buffer_size = static_cast<u32>(buffer_operand.pitch * buffer_operand.height);
+    const u32 bpp = VideoCore::Surface::BytesPerBlock(image->info.format);
+    const auto convert = [old_bpp = image_operand.bytes_per_pixel, bpp](u32 value) {
+        return (old_bpp * value) / bpp;
+    };
+    const u32 base_x = convert(image_operand.params.origin.x.Value());
+    const u32 base_y = image_operand.params.origin.y.Value();
+    const u32 length_x = convert(copy_info.length_x);
+    const u32 length_y = copy_info.length_y;
+
+    const BufferImageCopy copy{
+        .buffer_offset = 0,
+        .buffer_size = buffer_size,
+        .buffer_row_length = convert(buffer_operand.pitch),
+        .buffer_image_height = buffer_operand.height,
+        .image_subresource =
+            {
+                .base_level = static_cast<s32>(level),
+                .base_layer = static_cast<s32>(base),
+                .num_layers = 1,
+            },
+        .image_offset =
+            {
+                .x = static_cast<s32>(base_x),
+                .y = static_cast<s32>(base_y),
+                .z = 0,
+            },
+        .image_extent =
+            {
+                .width = length_x,
+                .height = length_y,
+                .depth = 1,
+            },
+    };
+    return {image, copy};
+}
+
+template <class P>
 void TextureCache<P>::RefreshContents(Image& image, ImageId image_id) {
     if (False(image.flags & ImageFlagBits::CpuModified)) {
         // Only upload modified images
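
In DmaBufferImageCopy, only horizontal quantities go through convert: x origins, widths, and the row pitch are measured in texels whose size changed with the fabricated format, while row counts are unaffected. A worked instance of the conversion, assuming a 1 byte/pixel DMA view of a 4 bytes/block cached image:

```cpp
#include <cstdint>

constexpr std::uint32_t Convert(std::uint32_t value, std::uint32_t old_bpp, std::uint32_t bpp) {
    return (old_bpp * value) / bpp;
}

// A 256-byte-wide row seen as 1 B/px becomes 64 texels of a 4 B/block image;
// the row count (y, height) is unaffected by the stride change.
static_assert(Convert(256, 1, 4) == 64);
static_assert(Convert(640, 1, 4) == 160);
```
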
@@ -1359,6 +1421,63 @@ std::optional<typename TextureCache<P>::BlitImages> TextureCache<P>::GetBlitImag
 }
 
 template <class P>
+ImageId TextureCache<P>::FindDMAImage(const ImageInfo& info, GPUVAddr gpu_addr) {
+    std::optional<VAddr> cpu_addr = gpu_memory->GpuToCpuAddress(gpu_addr);
+    if (!cpu_addr) {
+        cpu_addr = gpu_memory->GpuToCpuAddress(gpu_addr, CalculateGuestSizeInBytes(info));
+        if (!cpu_addr) {
+            return ImageId{};
+        }
+    }
+    ImageId image_id{};
+    boost::container::small_vector<ImageId, 1> image_ids;
+    const auto lambda = [&](ImageId existing_image_id, ImageBase& existing_image) {
+        if (True(existing_image.flags & ImageFlagBits::Remapped)) {
+            return false;
+        }
+        if (info.type == ImageType::Linear || existing_image.info.type == ImageType::Linear)
+            [[unlikely]] {
+            const bool strict_size = True(existing_image.flags & ImageFlagBits::Strong);
+            const ImageInfo& existing = existing_image.info;
+            if (existing_image.gpu_addr == gpu_addr && existing.type == info.type &&
+                existing.pitch == info.pitch &&
+                IsPitchLinearSameSize(existing, info, strict_size) &&
+                IsViewCompatible(existing.format, info.format, false, true)) {
+                image_id = existing_image_id;
+                image_ids.push_back(existing_image_id);
+                return true;
+            }
+        } else if (IsSubCopy(info, existing_image, gpu_addr)) {
+            image_id = existing_image_id;
+            image_ids.push_back(existing_image_id);
+            return true;
+        }
+        return false;
+    };
+    ForEachImageInRegion(*cpu_addr, CalculateGuestSizeInBytes(info), lambda);
+    if (image_ids.size() <= 1) [[likely]] {
+        return image_id;
+    }
+    auto image_ids_compare = [this](ImageId a, ImageId b) {
+        auto& image_a = slot_images[a];
+        auto& image_b = slot_images[b];
+        return image_a.modification_tick < image_b.modification_tick;
+    };
+    return *std::ranges::max_element(image_ids, image_ids_compare);
+}
+
+template <class P>
+std::pair<u32, u32> TextureCache<P>::PrepareDmaImage(ImageId dst_id, GPUVAddr base_addr,
+                                                     bool mark_as_modified) {
+    const auto& image = slot_images[dst_id];
+    const auto base = image.TryFindBase(base_addr);
+    PrepareImage(dst_id, mark_as_modified, false);
+    const auto& new_image = slot_images[dst_id];
+    lru_cache.Touch(new_image.lru_index, frame_tick);
+    return std::make_pair(base->level, base->layer);
+}
+
+template <class P>
 SamplerId TextureCache<P>::FindSampler(const TSCEntry& config) {
     if (std::ranges::all_of(config.raw, [](u64 value) { return value == 0; })) {
         return NULL_SAMPLER_ID;
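
When FindDMAImage sees several overlapping candidates, it keeps the one with the highest modification_tick, i.e. the image written most recently. The selection reduces to std::ranges::max_element with a tick comparator (illustrative types):

```cpp
#include <algorithm>
#include <cassert>
#include <vector>

struct ImageEntry {
    int id;
    unsigned long long modification_tick;
};

int main() {
    // Of three cached images overlapping the DMA range, prefer the one
    // written most recently, mirroring FindDMAImage's tie-break.
    std::vector<ImageEntry> overlaps{{1, 10}, {2, 42}, {3, 7}};
    const auto newest =
        std::ranges::max_element(overlaps, [](const ImageEntry& a, const ImageEntry& b) {
            return a.modification_tick < b.modification_tick;
        });
    assert(newest->id == 2);
}
```
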
diff --git a/src/video_core/texture_cache/texture_cache_base.h b/src/video_core/texture_cache/texture_cache_base.h
index 013836933..5a5b4179c 100644
--- a/src/video_core/texture_cache/texture_cache_base.h
+++ b/src/video_core/texture_cache/texture_cache_base.h
@@ -209,6 +209,12 @@ public:
     /// Pop asynchronous downloads
     void PopAsyncFlushes();
 
+    [[nodiscard]] ImageId DmaImageId(const Tegra::DMA::ImageOperand& operand);
+
+    [[nodiscard]] std::pair<Image*, BufferImageCopy> DmaBufferImageCopy(
+        const Tegra::DMA::ImageCopy& copy_info, const Tegra::DMA::BufferOperand& buffer_operand,
+        const Tegra::DMA::ImageOperand& image_operand, ImageId image_id, bool modifies_image);
+
     /// Return true when a CPU region is modified from the GPU
     [[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size);
 
@@ -300,6 +306,8 @@ private:
     /// Remove joined images from the cache
     [[nodiscard]] ImageId JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VAddr cpu_addr);
 
+    [[nodiscard]] ImageId FindDMAImage(const ImageInfo& info, GPUVAddr gpu_addr);
+
     /// Return a blit image pair from the given guest blit parameters
     [[nodiscard]] std::optional<BlitImages> GetBlitImages(
         const Tegra::Engines::Fermi2D::Surface& dst, const Tegra::Engines::Fermi2D::Surface& src,
@@ -381,6 +389,9 @@ private:
     /// Returns true if the current clear parameters clear the whole image of a given image view
     [[nodiscard]] bool IsFullClear(ImageViewId id);
 
+    [[nodiscard]] std::pair<u32, u32> PrepareDmaImage(ImageId dst_id, GPUVAddr base_addr,
+                                                      bool mark_as_modified);
+
     bool ImageCanRescale(ImageBase& image);
     void InvalidateScale(Image& image);
     bool ScaleUp(Image& image);
diff --git a/src/video_core/texture_cache/types.h b/src/video_core/texture_cache/types.h
index 0453456b4..a0e10643f 100644
--- a/src/video_core/texture_cache/types.h
+++ b/src/video_core/texture_cache/types.h
@@ -54,6 +54,7 @@ enum class RelaxedOptions : u32 {
     Format = 1 << 1,
     Samples = 1 << 2,
     ForceBrokenViews = 1 << 3,
+    FormatBpp = 1 << 4,
 };
 DECLARE_ENUM_FLAG_OPERATORS(RelaxedOptions)
 
diff --git a/src/video_core/texture_cache/util.cpp b/src/video_core/texture_cache/util.cpp
index 697f86641..de37db684 100644
--- a/src/video_core/texture_cache/util.cpp
+++ b/src/video_core/texture_cache/util.cpp
@@ -743,6 +743,44 @@ std::vector<ImageCopy> MakeShrinkImageCopies(const ImageInfo& dst, const ImageIn
     return copies;
 }
 
+std::vector<ImageCopy> MakeReinterpretImageCopies(const ImageInfo& src, u32 up_scale,
+                                                  u32 down_shift) {
+    std::vector<ImageCopy> copies;
+    copies.reserve(src.resources.levels);
+    const bool is_3d = src.type == ImageType::e3D;
+    for (s32 level = 0; level < src.resources.levels; ++level) {
+        ImageCopy& copy = copies.emplace_back();
+        copy.src_subresource = SubresourceLayers{
+            .base_level = level,
+            .base_layer = 0,
+            .num_layers = src.resources.layers,
+        };
+        copy.dst_subresource = SubresourceLayers{
+            .base_level = level,
+            .base_layer = 0,
+            .num_layers = src.resources.layers,
+        };
+        copy.src_offset = Offset3D{
+            .x = 0,
+            .y = 0,
+            .z = 0,
+        };
+        copy.dst_offset = Offset3D{
+            .x = 0,
+            .y = 0,
+            .z = 0,
+        };
+        const Extent3D mip_size = AdjustMipSize(src.size, level);
+        copy.extent = AdjustSamplesSize(mip_size, src.num_samples);
+        if (is_3d) {
+            copy.extent.depth = src.size.depth;
+        }
+        copy.extent.width = std::max<u32>((copy.extent.width * up_scale) >> down_shift, 1);
+        copy.extent.height = std::max<u32>((copy.extent.height * up_scale) >> down_shift, 1);
+    }
+    return copies;
+}
+
 bool IsValidEntry(const Tegra::MemoryManager& gpu_memory, const TICEntry& config) {
     const GPUVAddr address = config.Address();
     if (address == 0) {
@@ -999,6 +1037,20 @@ bool IsBlockLinearSizeCompatible(const ImageInfo& lhs, const ImageInfo& rhs, u32
     }
 }
 
+bool IsBlockLinearSizeCompatibleBPPRelaxed(const ImageInfo& lhs, const ImageInfo& rhs,
+                                           u32 lhs_level, u32 rhs_level) noexcept {
+    ASSERT(lhs.type != ImageType::Linear);
+    ASSERT(rhs.type != ImageType::Linear);
+    const auto lhs_bpp = BytesPerBlock(lhs.format);
+    const auto rhs_bpp = BytesPerBlock(rhs.format);
+    const Extent3D lhs_size = AdjustMipSize(lhs.size, lhs_level);
+    const Extent3D rhs_size = AdjustMipSize(rhs.size, rhs_level);
+    return Common::AlignUpLog2(lhs_size.width * lhs_bpp, GOB_SIZE_X_SHIFT) ==
+               Common::AlignUpLog2(rhs_size.width * rhs_bpp, GOB_SIZE_X_SHIFT) &&
+           Common::AlignUpLog2(lhs_size.height, GOB_SIZE_Y_SHIFT) ==
+               Common::AlignUpLog2(rhs_size.height, GOB_SIZE_Y_SHIFT);
+}
+
 bool IsPitchLinearSameSize(const ImageInfo& lhs, const ImageInfo& rhs, bool strict_size) noexcept {
     ASSERT(lhs.type == ImageType::Linear);
     ASSERT(rhs.type == ImageType::Linear);
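
IsBlockLinearSizeCompatibleBPPRelaxed compares GOB-aligned byte widths rather than texel widths, so two views can alias the same block-linear memory even with different bytes per block. A sketch of the alignment arithmetic; a Maxwell GOB is 64 bytes wide (shift 6) by 8 rows tall (shift 3):

```cpp
#include <cstdint>

// Align `value` up to a power-of-two boundary given as a shift, like
// Common::AlignUpLog2.
constexpr std::uint32_t AlignUpLog2(std::uint32_t value, std::uint32_t shift) {
    return (value + (1u << shift) - 1) >> shift << shift;
}

// A 100-texel row at 4 B/block and a 50-texel row at 8 B/block both span
// 400 bytes, which align to the same 448-byte GOB pitch.
static_assert(AlignUpLog2(100 * 4, 6) == AlignUpLog2(50 * 8, 6));
```
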
@@ -1073,7 +1125,8 @@ std::optional<SubresourceBase> FindSubresource(const ImageInfo& candidate, const
         // Format checking is relaxed, but we still have to check for matching bytes per block.
         // This avoids creating a view for blits on UE4 titles where formats with different bytes
         // per block are aliased.
-        if (BytesPerBlock(existing.format) != BytesPerBlock(candidate.format)) {
+        if (BytesPerBlock(existing.format) != BytesPerBlock(candidate.format) &&
+            False(options & RelaxedOptions::FormatBpp)) {
             return std::nullopt;
         }
     } else {
@@ -1088,10 +1141,8 @@ std::optional<SubresourceBase> FindSubresource(const ImageInfo& candidate, const
     if (existing.type != candidate.type) {
         return std::nullopt;
     }
-    if (False(options & RelaxedOptions::Samples)) {
-        if (existing.num_samples != candidate.num_samples) {
-            return std::nullopt;
-        }
+    if (False(options & RelaxedOptions::Samples) && existing.num_samples != candidate.num_samples) {
+        return std::nullopt;
     }
     if (existing.resources.levels < candidate.resources.levels + base->level) {
         return std::nullopt;
@@ -1101,14 +1152,16 @@ std::optional<SubresourceBase> FindSubresource(const ImageInfo& candidate, const
         if (mip_depth < candidate.size.depth + base->layer) {
             return std::nullopt;
         }
-    } else {
-        if (existing.resources.layers < candidate.resources.layers + base->layer) {
-            return std::nullopt;
-        }
+    } else if (existing.resources.layers < candidate.resources.layers + base->layer) {
+        return std::nullopt;
     }
     const bool strict_size = False(options & RelaxedOptions::Size);
     if (!IsBlockLinearSizeCompatible(existing, candidate, base->level, 0, strict_size)) {
-        return std::nullopt;
+        if (False(options & RelaxedOptions::FormatBpp)) {
+            return std::nullopt;
+        } else if (!IsBlockLinearSizeCompatibleBPPRelaxed(existing, candidate, base->level, 0)) {
+            return std::nullopt;
+        }
     }
     // TODO: compare block sizes
     return base;
@@ -1120,6 +1173,31 @@ bool IsSubresource(const ImageInfo& candidate, const ImageBase& image, GPUVAddr
         .has_value();
 }
 
+bool IsSubCopy(const ImageInfo& candidate, const ImageBase& image, GPUVAddr candidate_addr) {
+    const std::optional<SubresourceBase> base = image.TryFindBase(candidate_addr);
+    if (!base) {
+        return false;
+    }
+    const ImageInfo& existing = image.info;
+    if (existing.resources.levels < candidate.resources.levels + base->level) {
+        return false;
+    }
+    if (existing.type == ImageType::e3D) {
+        const u32 mip_depth = std::max(1U, existing.size.depth >> base->level);
+        if (mip_depth < candidate.size.depth + base->layer) {
+            return false;
+        }
+    } else {
+        if (existing.resources.layers < candidate.resources.layers + base->layer) {
+            return false;
+        }
+    }
+    if (!IsBlockLinearSizeCompatibleBPPRelaxed(existing, candidate, base->level, 0)) {
+        return false;
+    }
+    return true;
+}
+
 void DeduceBlitImages(ImageInfo& dst_info, ImageInfo& src_info, const ImageBase* dst,
                       const ImageBase* src) {
     const auto original_dst_format = dst_info.format;
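
The mip_depth computation in IsSubCopy shrinks the base depth by the mip level, one right shift per level clamped to 1, matching how 3D mip chains are defined:

```cpp
#include <algorithm>
#include <cstdint>

// Mip level N halves each dimension N times, never dropping below 1.
constexpr std::uint32_t AdjustMipDim(std::uint32_t dim, std::uint32_t level) {
    return std::max(1u, dim >> level);
}

static_assert(AdjustMipDim(64, 0) == 64);
static_assert(AdjustMipDim(64, 3) == 8);
static_assert(AdjustMipDim(64, 7) == 1); // clamped at 1
```
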
diff --git a/src/video_core/texture_cache/util.h b/src/video_core/texture_cache/util.h
index d103db8ae..84aa6880d 100644
--- a/src/video_core/texture_cache/util.h
+++ b/src/video_core/texture_cache/util.h
@@ -56,6 +56,10 @@ struct OverlapResult {
                                                        SubresourceBase base, u32 up_scale = 1,
                                                        u32 down_shift = 0);
 
+[[nodiscard]] std::vector<ImageCopy> MakeReinterpretImageCopies(const ImageInfo& src,
+                                                                u32 up_scale = 1,
+                                                                u32 down_shift = 0);
+
 [[nodiscard]] bool IsValidEntry(const Tegra::MemoryManager& gpu_memory, const TICEntry& config);
 
 [[nodiscard]] std::vector<BufferImageCopy> UnswizzleImage(Tegra::MemoryManager& gpu_memory,
@@ -88,6 +92,9 @@ void SwizzleImage(Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr, const Ima
 [[nodiscard]] bool IsPitchLinearSameSize(const ImageInfo& lhs, const ImageInfo& rhs,
                                          bool strict_size) noexcept;
 
+[[nodiscard]] bool IsBlockLinearSizeCompatibleBPPRelaxed(const ImageInfo& lhs, const ImageInfo& rhs,
+                                                         u32 lhs_level, u32 rhs_level) noexcept;
+
 [[nodiscard]] std::optional<OverlapResult> ResolveOverlap(const ImageInfo& new_info,
                                                           GPUVAddr gpu_addr, VAddr cpu_addr,
                                                           const ImageBase& overlap,
@@ -106,6 +113,9 @@ void SwizzleImage(Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr, const Ima
                                    GPUVAddr candidate_addr, RelaxedOptions options,
                                    bool broken_views, bool native_bgr);
 
+[[nodiscard]] bool IsSubCopy(const ImageInfo& candidate, const ImageBase& image,
+                             GPUVAddr candidate_addr);
+
 void DeduceBlitImages(ImageInfo& dst_info, ImageInfo& src_info, const ImageBase* dst,
                       const ImageBase* src);
 
diff --git a/src/video_core/textures/astc.cpp b/src/video_core/textures/astc.cpp
index 4381eed1d..a68bc0d77 100644
--- a/src/video_core/textures/astc.cpp
+++ b/src/video_core/textures/astc.cpp
@@ -1571,7 +1571,7 @@ static void DecompressBlock(std::span<const u8, 16> inBuf, const u32 blockWidth,
     assert(strm.GetBitsRead() + weightParams.GetPackedBitSize() == 128);
 
     // Decode both color data and texel weight data
-    u32 colorValues[32]; // Four values, two endpoints, four maximum paritions
+    u32 colorValues[32]; // Four values, two endpoints, four maximum partitions
     DecodeColorValues(colorValues, colorEndpointData, colorEndpointMode, nPartitions,
                       colorDataBits);
 
diff --git a/src/video_core/textures/texture.h b/src/video_core/textures/texture.h
index 7c4553a53..7e5837b20 100644
--- a/src/video_core/textures/texture.h
+++ b/src/video_core/textures/texture.h
@@ -15,26 +15,26 @@ enum class TextureFormat : u32 {
     R32G32B32 = 0x02,
     R16G16B16A16 = 0x03,
     R32G32 = 0x04,
-    R32_B24G8 = 0x05,
+    R32B24G8 = 0x05,
    ETC2_RGB = 0x06,
     X8B8G8R8 = 0x07,
-    A8R8G8B8 = 0x08,
+    A8B8G8R8 = 0x08,
     A2B10G10R10 = 0x09,
     ETC2_RGB_PTA = 0x0a,
     ETC2_RGBA = 0x0b,
     R16G16 = 0x0c,
-    R24G8 = 0x0d,
-    R8G24 = 0x0e,
+    G8R24 = 0x0d,
+    G24R8 = 0x0e,
     R32 = 0x0f,
-    BC6H_SFLOAT = 0x10,
-    BC6H_UFLOAT = 0x11,
+    BC6H_S16 = 0x10,
+    BC6H_U16 = 0x11,
     A4B4G4R4 = 0x12,
     A5B5G5R1 = 0x13,
     A1B5G5R5 = 0x14,
     B5G6R5 = 0x15,
     B6G5R5 = 0x16,
-    BC7 = 0x17,
-    R8G8 = 0x18,
+    BC7U = 0x17,
+    G8R8 = 0x18,
     EAC = 0x19,
     EACX2 = 0x1a,
     R16 = 0x1b,
@@ -46,33 +46,33 @@ enum class TextureFormat : u32 {
     B10G11R11 = 0x21,
     G8B8G8R8 = 0x22,
     B8G8R8G8 = 0x23,
-    BC1_RGBA = 0x24,
-    BC2 = 0x25,
-    BC3 = 0x26,
-    BC4 = 0x27,
-    BC5 = 0x28,
-    S8D24 = 0x29,
-    X8D24 = 0x2a,
-    D24S8 = 0x2b,
-    X4V4D24__COV4R4V = 0x2c,
-    X4V4D24__COV8R8V = 0x2d,
-    V8D24__COV4R12V = 0x2e,
-    D32 = 0x2f,
-    D32S8 = 0x30,
-    X8D24_X20V4S8__COV4R4V = 0x31,
-    X8D24_X20V4S8__COV8R8V = 0x32,
-    D32_X20V4X8__COV4R4V = 0x33,
-    D32_X20V4X8__COV8R8V = 0x34,
-    D32_X20V4S8__COV4R4V = 0x35,
-    D32_X20V4S8__COV8R8V = 0x36,
-    X8D24_X16V8S8__COV4R12V = 0x37,
-    D32_X16V8X8__COV4R12V = 0x38,
-    D32_X16V8S8__COV4R12V = 0x39,
-    D16 = 0x3a,
-    V8D24__COV8R24V = 0x3b,
-    X8D24_X16V8S8__COV8R24V = 0x3c,
-    D32_X16V8X8__COV8R24V = 0x3d,
-    D32_X16V8S8__COV8R24V = 0x3e,
+    DXT1 = 0x24,
+    DXT23 = 0x25,
+    DXT45 = 0x26,
+    DXN1 = 0x27,
+    DXN2 = 0x28,
+    Z24S8 = 0x29,
+    X8Z24 = 0x2a,
+    S8Z24 = 0x2b,
+    X4V4Z24__COV4R4V = 0x2c,
+    X4V4Z24__COV8R8V = 0x2d,
+    V8Z24__COV4R12V = 0x2e,
+    Z32 = 0x2f,
+    Z32_X24S8 = 0x30,
+    X8Z24_X20V4S8__COV4R4V = 0x31,
+    X8Z24_X20V4S8__COV8R8V = 0x32,
+    Z32_X20V4X8__COV4R4V = 0x33,
+    Z32_X20V4X8__COV8R8V = 0x34,
+    Z32_X20V4S8__COV4R4V = 0x35,
+    Z32_X20V4S8__COV8R8V = 0x36,
+    X8Z24_X16V8S8__COV4R12V = 0x37,
+    Z32_X16V8X8__COV4R12V = 0x38,
+    Z32_X16V8S8__COV4R12V = 0x39,
+    Z16 = 0x3a,
+    V8Z24__COV8R24V = 0x3b,
+    X8Z24_X16V8S8__COV8R24V = 0x3c,
+    Z32_X16V8X8__COV8R24V = 0x3d,
+    Z32_X16V8S8__COV8R24V = 0x3e,
     ASTC_2D_4X4 = 0x40,
     ASTC_2D_5X5 = 0x41,
     ASTC_2D_6X6 = 0x42,
diff --git a/src/video_core/vulkan_common/vulkan_device.cpp b/src/video_core/vulkan_common/vulkan_device.cpp
index 48f1a3d14..6f288b3f8 100644
--- a/src/video_core/vulkan_common/vulkan_device.cpp
+++ b/src/video_core/vulkan_common/vulkan_device.cpp
@@ -401,6 +401,12 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR
             loaded_extensions.erase(VK_EXT_EXTENDED_DYNAMIC_STATE_2_EXTENSION_NAME);
         }
     }
+    if (extensions.extended_dynamic_state3 && is_radv) {
+        LOG_WARNING(Render_Vulkan, "RADV has broken extendedDynamicState3ColorBlendEquation");
+        features.extended_dynamic_state3.extendedDynamicState3ColorBlendEnable = false;
+        features.extended_dynamic_state3.extendedDynamicState3ColorBlendEquation = false;
+        dynamic_state3_blending = false;
+    }
     if (extensions.vertex_input_dynamic_state && is_radv) {
         // TODO(ameerj): Blacklist only offending driver versions
         // TODO(ameerj): Confirm if RDNA1 is affected
@@ -417,7 +423,7 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR
 
     sets_per_pool = 64;
     if (is_amd_driver) {
-        // AMD drivers need a higher amount of Sets per Pool in certain circunstances like in XC2.
+        // AMD drivers need a higher amount of Sets per Pool in certain circumstances like in XC2.
         sets_per_pool = 96;
         // Disable VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT on AMD GCN4 and lower as it is broken.
         if (!features.shader_float16_int8.shaderFloat16) {
diff --git a/src/video_core/vulkan_common/vulkan_device.h b/src/video_core/vulkan_common/vulkan_device.h
index 0662a2d9f..41b5da18a 100644
--- a/src/video_core/vulkan_common/vulkan_device.h
+++ b/src/video_core/vulkan_common/vulkan_device.h
@@ -180,7 +180,7 @@ public:
     ~Device();
 
     /**
-     * Returns a format supported by the device for the passed requeriments.
+     * Returns a format supported by the device for the passed requirements.
      * @param wanted_format The ideal format to be returned. It may not be the returned format.
     * @param wanted_usage The usage that must be fulfilled even if the format is not supported.
     * @param format_type Format type usage.
@@ -259,12 +259,12 @@ public:
 
     bool ShouldBoostClocks() const;
 
-    /// Returns uniform buffer alignment requeriment.
+    /// Returns uniform buffer alignment requirement.
     VkDeviceSize GetUniformBufferAlignment() const {
         return properties.properties.limits.minUniformBufferOffsetAlignment;
     }
 
-    /// Returns storage alignment requeriment.
+    /// Returns storage alignment requirement.
     VkDeviceSize GetStorageBufferAlignment() const {
         return properties.properties.limits.minStorageBufferOffsetAlignment;
     }
@@ -656,7 +656,7 @@ private:
     bool is_integrated{};                 ///< Is GPU an iGPU.
     bool is_virtual{};                    ///< Is GPU a virtual GPU.
     bool is_non_gpu{};                    ///< Is SoftwareRasterizer, FPGA, non-GPU device.
-    bool has_broken_cube_compatibility{}; ///< Has broken cube compatiblity bit
+    bool has_broken_cube_compatibility{}; ///< Has broken cube compatibility bit
     bool has_renderdoc{};                 ///< Has RenderDoc attached
     bool has_nsight_graphics{};           ///< Has Nsight Graphics attached
     bool supports_d24_depth{};            ///< Supports D24 depth buffers.
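
The RADV change follows the driver-quirk pattern used throughout Device: detect the driver once, log the reason, then clear the feature bits it advertises but implements incorrectly so the rest of the renderer never sees them. Schematically (types and names illustrative, not the Device API):

```cpp
#include <cstdio>

struct BlendFeatures {
    bool color_blend_enable = true;
    bool color_blend_equation = true;
};

// Mask out features a known-broken driver advertises anyway.
void ApplyDriverQuirks(BlendFeatures& features, bool is_radv) {
    if (is_radv) {
        std::puts("warning: disabling broken dynamic blend state on RADV");
        features.color_blend_enable = false;
        features.color_blend_equation = false;
    }
}
```
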
diff --git a/src/video_core/vulkan_common/vulkan_wrapper.h b/src/video_core/vulkan_common/vulkan_wrapper.h
index e86f661cb..4ff328a21 100644
--- a/src/video_core/vulkan_common/vulkan_wrapper.h
+++ b/src/video_core/vulkan_common/vulkan_wrapper.h
@@ -68,7 +68,7 @@ public:
     constexpr Span(const Range& range) : ptr{std::data(range)}, num{std::size(range)} {}
 
     /// Construct a span from a pointer and a size.
-    /// This is inteded for subranges.
+    /// This is intended for subranges.
    constexpr Span(const T* ptr_, std::size_t num_) noexcept : ptr{ptr_}, num{num_} {}
 
     /// Returns the data pointer by the span.
@@ -390,11 +390,11 @@ public:
     Handle(const Handle&) = delete;
     Handle& operator=(const Handle&) = delete;
 
-    /// Construct a handle transfering the ownership from another handle.
+    /// Construct a handle transferring the ownership from another handle.
     Handle(Handle&& rhs) noexcept
         : handle{std::exchange(rhs.handle, nullptr)}, owner{rhs.owner}, dld{rhs.dld} {}
 
-    /// Assign the current handle transfering the ownership from another handle.
+    /// Assign the current handle transferring the ownership from another handle.
     /// Destroys any previously held object.
     Handle& operator=(Handle&& rhs) noexcept {
         Release();
@@ -463,10 +463,10 @@ public:
     Handle(const Handle&) = delete;
     Handle& operator=(const Handle&) = delete;
 
-    /// Construct a handle transfering ownership from another handle.
+    /// Construct a handle transferring ownership from another handle.
     Handle(Handle&& rhs) noexcept : handle{std::exchange(rhs.handle, nullptr)}, dld{rhs.dld} {}
 
-    /// Assign the current handle transfering the ownership from another handle.
+    /// Assign the current handle transferring the ownership from another handle.
     /// Destroys any previously held object.
     Handle& operator=(Handle&& rhs) noexcept {
         Release();
@@ -533,12 +533,12 @@ public:
     PoolAllocations(const PoolAllocations&) = delete;
     PoolAllocations& operator=(const PoolAllocations&) = delete;
 
-    /// Construct an allocation transfering ownership from another allocation.
+    /// Construct an allocation transferring ownership from another allocation.
     PoolAllocations(PoolAllocations&& rhs) noexcept
         : allocations{std::move(rhs.allocations)}, num{rhs.num}, device{rhs.device}, pool{rhs.pool},
           dld{rhs.dld} {}
 
-    /// Assign an allocation transfering ownership from another allocation.
+    /// Assign an allocation transferring ownership from another allocation.
     PoolAllocations& operator=(PoolAllocations&& rhs) noexcept {
         allocations = std::move(rhs.allocations);
         num = rhs.num;
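
The corrected comments describe the wrapper's move-only ownership model: moving a Handle transfers the raw object and nulls the source via std::exchange, and move-assignment releases any previously held object first. A condensed sketch of that shape:

```cpp
#include <utility>

// Minimal move-only handle in the style of the wrapper's Handle<T>:
// moving transfers ownership and leaves the source null.
class Handle {
public:
    Handle() = default;
    explicit Handle(void* raw) : handle{raw} {}
    Handle(const Handle&) = delete;
    Handle& operator=(const Handle&) = delete;

    Handle(Handle&& rhs) noexcept : handle{std::exchange(rhs.handle, nullptr)} {}

    Handle& operator=(Handle&& rhs) noexcept {
        Release(); // destroy any previously held object first
        handle = std::exchange(rhs.handle, nullptr);
        return *this;
    }

    ~Handle() {
        Release();
    }

private:
    void Release() { /* destroy `handle` with the owning device here */ }
    void* handle = nullptr;
};
```
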