Diffstat (limited to 'src')
39 files changed, 974 insertions, 247 deletions
diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt index 9b0c3db68..9afc6105d 100644 --- a/src/common/CMakeLists.txt +++ b/src/common/CMakeLists.txt @@ -15,6 +15,10 @@ endif () if (DEFINED ENV{DISPLAYVERSION}) set(DISPLAY_VERSION $ENV{DISPLAYVERSION}) endif () + +# Pass the path to git to the GenerateSCMRev.cmake as well +find_package(Git QUIET) + add_custom_command(OUTPUT scm_rev.cpp COMMAND ${CMAKE_COMMAND} -DSRC_DIR="${CMAKE_SOURCE_DIR}" @@ -23,6 +27,7 @@ add_custom_command(OUTPUT scm_rev.cpp -DTITLE_BAR_FORMAT_RUNNING="${TITLE_BAR_FORMAT_RUNNING}" -DBUILD_TAG="${BUILD_TAG}" -DBUILD_ID="${DISPLAY_VERSION}" + -DGIT_EXECUTABLE="${GIT_EXECUTABLE}" -P "${CMAKE_SOURCE_DIR}/CMakeModules/GenerateSCMRev.cmake" DEPENDS # WARNING! It was too much work to try and make a common location for this list, diff --git a/src/common/telemetry.cpp b/src/common/telemetry.cpp index f53a8d193..200c6489a 100644 --- a/src/common/telemetry.cpp +++ b/src/common/telemetry.cpp @@ -44,20 +44,6 @@ template class Field<std::string>; template class Field<const char*>; template class Field<std::chrono::microseconds>; -#ifdef ARCHITECTURE_x86_64 -static const char* CpuVendorToStr(Common::CPUVendor vendor) { - switch (vendor) { - case Common::CPUVendor::INTEL: - return "Intel"; - case Common::CPUVendor::AMD: - return "Amd"; - case Common::CPUVendor::OTHER: - return "Other"; - } - UNREACHABLE(); -} -#endif - void AppendBuildInfo(FieldCollection& fc) { const bool is_git_dirty{std::strstr(Common::g_scm_desc, "dirty") != nullptr}; fc.AddField(FieldType::App, "Git_IsDirty", is_git_dirty); @@ -71,7 +57,6 @@ void AppendCPUInfo(FieldCollection& fc) { #ifdef ARCHITECTURE_x86_64 fc.AddField(FieldType::UserSystem, "CPU_Model", Common::GetCPUCaps().cpu_string); fc.AddField(FieldType::UserSystem, "CPU_BrandString", Common::GetCPUCaps().brand_string); - fc.AddField(FieldType::UserSystem, "CPU_Vendor", CpuVendorToStr(Common::GetCPUCaps().vendor)); fc.AddField(FieldType::UserSystem, "CPU_Extension_x64_AES", Common::GetCPUCaps().aes); fc.AddField(FieldType::UserSystem, "CPU_Extension_x64_AVX", Common::GetCPUCaps().avx); fc.AddField(FieldType::UserSystem, "CPU_Extension_x64_AVX2", Common::GetCPUCaps().avx2); diff --git a/src/common/x64/cpu_detect.cpp b/src/common/x64/cpu_detect.cpp index 2dfcd39c8..c9349a6b4 100644 --- a/src/common/x64/cpu_detect.cpp +++ b/src/common/x64/cpu_detect.cpp @@ -3,8 +3,6 @@ // Refer to the license.txt file included. #include <cstring> -#include <string> -#include <thread> #include "common/common_types.h" #include "common/x64/cpu_detect.h" @@ -51,8 +49,6 @@ namespace Common { static CPUCaps Detect() { CPUCaps caps = {}; - caps.num_cores = std::thread::hardware_concurrency(); - // Assumes the CPU supports the CPUID instruction. 
Those that don't would likely not support // yuzu at all anyway @@ -70,12 +66,6 @@ static CPUCaps Detect() { __cpuid(cpu_id, 0x80000000); u32 max_ex_fn = cpu_id[0]; - if (!strcmp(caps.brand_string, "GenuineIntel")) - caps.vendor = CPUVendor::INTEL; - else if (!strcmp(caps.brand_string, "AuthenticAMD")) - caps.vendor = CPUVendor::AMD; - else - caps.vendor = CPUVendor::OTHER; // Set reasonable default brand string even if brand string not available strcpy(caps.cpu_string, caps.brand_string); @@ -96,15 +86,9 @@ static CPUCaps Detect() { caps.sse4_1 = true; if ((cpu_id[2] >> 20) & 1) caps.sse4_2 = true; - if ((cpu_id[2] >> 22) & 1) - caps.movbe = true; if ((cpu_id[2] >> 25) & 1) caps.aes = true; - if ((cpu_id[3] >> 24) & 1) { - caps.fxsave_fxrstor = true; - } - // AVX support requires 3 separate checks: // - Is the AVX bit set in CPUID? // - Is the XSAVE bit set in CPUID? @@ -129,8 +113,6 @@ static CPUCaps Detect() { } } - caps.flush_to_zero = caps.sse; - if (max_ex_fn >= 0x80000004) { // Extract CPU model string __cpuid(cpu_id, 0x80000002); @@ -144,14 +126,8 @@ static CPUCaps Detect() { if (max_ex_fn >= 0x80000001) { // Check for more features __cpuid(cpu_id, 0x80000001); - if (cpu_id[2] & 1) - caps.lahf_sahf_64 = true; - if ((cpu_id[2] >> 5) & 1) - caps.lzcnt = true; if ((cpu_id[2] >> 16) & 1) caps.fma4 = true; - if ((cpu_id[3] >> 29) & 1) - caps.long_mode = true; } return caps; @@ -162,48 +138,4 @@ const CPUCaps& GetCPUCaps() { return caps; } -std::string GetCPUCapsString() { - auto caps = GetCPUCaps(); - - std::string sum(caps.cpu_string); - sum += " ("; - sum += caps.brand_string; - sum += ")"; - - if (caps.sse) - sum += ", SSE"; - if (caps.sse2) { - sum += ", SSE2"; - if (!caps.flush_to_zero) - sum += " (without DAZ)"; - } - - if (caps.sse3) - sum += ", SSE3"; - if (caps.ssse3) - sum += ", SSSE3"; - if (caps.sse4_1) - sum += ", SSE4.1"; - if (caps.sse4_2) - sum += ", SSE4.2"; - if (caps.avx) - sum += ", AVX"; - if (caps.avx2) - sum += ", AVX2"; - if (caps.bmi1) - sum += ", BMI1"; - if (caps.bmi2) - sum += ", BMI2"; - if (caps.fma) - sum += ", FMA"; - if (caps.aes) - sum += ", AES"; - if (caps.movbe) - sum += ", MOVBE"; - if (caps.long_mode) - sum += ", 64-bit support"; - - return sum; -} - } // namespace Common diff --git a/src/common/x64/cpu_detect.h b/src/common/x64/cpu_detect.h index 0af3a8adb..20f2ba234 100644 --- a/src/common/x64/cpu_detect.h +++ b/src/common/x64/cpu_detect.h @@ -4,23 +4,12 @@ #pragma once -#include <string> - namespace Common { -/// x86/x64 CPU vendors that may be detected by this module -enum class CPUVendor { - INTEL, - AMD, - OTHER, -}; - /// x86/x64 CPU capabilities that may be detected by this module struct CPUCaps { - CPUVendor vendor; char cpu_string[0x21]; char brand_string[0x41]; - int num_cores; bool sse; bool sse2; bool sse3; @@ -35,20 +24,6 @@ struct CPUCaps { bool fma; bool fma4; bool aes; - - // Support for the FXSAVE and FXRSTOR instructions - bool fxsave_fxrstor; - - bool movbe; - - // This flag indicates that the hardware supports some mode in which denormal inputs and outputs - // are automatically set to (signed) zero. 
- bool flush_to_zero; - - // Support for LAHF and SAHF instructions in 64-bit mode - bool lahf_sahf_64; - - bool long_mode; }; /** @@ -57,10 +32,4 @@ struct CPUCaps { */ const CPUCaps& GetCPUCaps(); -/** - * Gets a string summary of the name and supported capabilities of the host CPU - * @return String summary - */ -std::string GetCPUCapsString(); - } // namespace Common diff --git a/src/core/hle/kernel/physical_memory.h b/src/core/hle/kernel/physical_memory.h index 090565310..b689e8e8b 100644 --- a/src/core/hle/kernel/physical_memory.h +++ b/src/core/hle/kernel/physical_memory.h @@ -14,6 +14,9 @@ namespace Kernel { // - Second to ensure all host backing memory used is aligned to 256 bytes due // to strict alignment restrictions on GPU memory. -using PhysicalMemory = std::vector<u8, Common::AlignmentAllocator<u8, 256>>; +using PhysicalMemoryVector = std::vector<u8, Common::AlignmentAllocator<u8, 256>>; +class PhysicalMemory final : public PhysicalMemoryVector { + using PhysicalMemoryVector::PhysicalMemoryVector; +}; } // namespace Kernel diff --git a/src/core/hle/kernel/process.cpp b/src/core/hle/kernel/process.cpp index 12ea4ebe3..b9035a0be 100644 --- a/src/core/hle/kernel/process.cpp +++ b/src/core/hle/kernel/process.cpp @@ -317,6 +317,8 @@ void Process::FreeTLSRegion(VAddr tls_address) { } void Process::LoadModule(CodeSet module_, VAddr base_addr) { + code_memory_size += module_.memory.size(); + const auto memory = std::make_shared<PhysicalMemory>(std::move(module_.memory)); const auto MapSegment = [&](const CodeSet::Segment& segment, VMAPermission permissions, @@ -332,8 +334,6 @@ void Process::LoadModule(CodeSet module_, VAddr base_addr) { MapSegment(module_.CodeSegment(), VMAPermission::ReadExecute, MemoryState::Code); MapSegment(module_.RODataSegment(), VMAPermission::Read, MemoryState::CodeData); MapSegment(module_.DataSegment(), VMAPermission::ReadWrite, MemoryState::CodeData); - - code_memory_size += module_.memory.size(); } Process::Process(Core::System& system) diff --git a/src/core/hle/kernel/vm_manager.cpp b/src/core/hle/kernel/vm_manager.cpp index a9a20ef76..0b3500fce 100644 --- a/src/core/hle/kernel/vm_manager.cpp +++ b/src/core/hle/kernel/vm_manager.cpp @@ -3,6 +3,7 @@ // Refer to the license.txt file included. #include <algorithm> +#include <cstring> #include <iterator> #include <utility> #include "common/alignment.h" @@ -269,18 +270,9 @@ ResultVal<VAddr> VMManager::SetHeapSize(u64 size) { // If necessary, expand backing vector to cover new heap extents in // the case of allocating. Otherwise, shrink the backing memory, // if a smaller heap has been requested. - const u64 old_heap_size = GetCurrentHeapSize(); - if (size > old_heap_size) { - const u64 alloc_size = size - old_heap_size; - - heap_memory->insert(heap_memory->end(), alloc_size, 0); - RefreshMemoryBlockMappings(heap_memory.get()); - } else if (size < old_heap_size) { - heap_memory->resize(size); - heap_memory->shrink_to_fit(); - - RefreshMemoryBlockMappings(heap_memory.get()); - } + heap_memory->resize(size); + heap_memory->shrink_to_fit(); + RefreshMemoryBlockMappings(heap_memory.get()); heap_end = heap_region_base + size; ASSERT(GetCurrentHeapSize() == heap_memory->size()); @@ -752,24 +744,20 @@ void VMManager::MergeAdjacentVMA(VirtualMemoryArea& left, const VirtualMemoryAre // Always merge allocated memory blocks, even when they don't share the same backing block. 
if (left.type == VMAType::AllocatedMemoryBlock && (left.backing_block != right.backing_block || left.offset + left.size != right.offset)) { - const auto right_begin = right.backing_block->begin() + right.offset; - const auto right_end = right_begin + right.size; // Check if we can save work. if (left.offset == 0 && left.size == left.backing_block->size()) { // Fast case: left is an entire backing block. - left.backing_block->insert(left.backing_block->end(), right_begin, right_end); + left.backing_block->resize(left.size + right.size); + std::memcpy(left.backing_block->data() + left.size, + right.backing_block->data() + right.offset, right.size); } else { // Slow case: make a new memory block for left and right. - const auto left_begin = left.backing_block->begin() + left.offset; - const auto left_end = left_begin + left.size; - const auto left_size = static_cast<std::size_t>(std::distance(left_begin, left_end)); - const auto right_size = static_cast<std::size_t>(std::distance(right_begin, right_end)); - auto new_memory = std::make_shared<PhysicalMemory>(); - new_memory->reserve(left_size + right_size); - new_memory->insert(new_memory->end(), left_begin, left_end); - new_memory->insert(new_memory->end(), right_begin, right_end); + new_memory->resize(left.size + right.size); + std::memcpy(new_memory->data(), left.backing_block->data() + left.offset, left.size); + std::memcpy(new_memory->data() + left.size, right.backing_block->data() + right.offset, + right.size); left.backing_block = std::move(new_memory); left.offset = 0; @@ -792,8 +780,7 @@ void VMManager::UpdatePageTableForVMA(const VirtualMemoryArea& vma) { memory.UnmapRegion(page_table, vma.base, vma.size); break; case VMAType::AllocatedMemoryBlock: - memory.MapMemoryRegion(page_table, vma.base, vma.size, - vma.backing_block->data() + vma.offset); + memory.MapMemoryRegion(page_table, vma.base, vma.size, *vma.backing_block, vma.offset); break; case VMAType::BackingMemory: memory.MapMemoryRegion(page_table, vma.base, vma.size, vma.backing_memory); diff --git a/src/core/loader/elf.cpp b/src/core/loader/elf.cpp index f1795fdd6..8908e5328 100644 --- a/src/core/loader/elf.cpp +++ b/src/core/loader/elf.cpp @@ -335,7 +335,8 @@ Kernel::CodeSet ElfReader::LoadInto(VAddr vaddr) { codeset_segment->addr = segment_addr; codeset_segment->size = aligned_size; - memcpy(&program_image[current_image_position], GetSegmentPtr(i), p->p_filesz); + std::memcpy(program_image.data() + current_image_position, GetSegmentPtr(i), + p->p_filesz); current_image_position += aligned_size; } } diff --git a/src/core/loader/kip.cpp b/src/core/loader/kip.cpp index 474b55cb1..092103abe 100644 --- a/src/core/loader/kip.cpp +++ b/src/core/loader/kip.cpp @@ -2,6 +2,7 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. 
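The loader changes in this area (ELF above, KIP here, NSO below) all drop iterator-based vector::insert in favor of sizing the image once and copying with std::memcpy; this plays well with Kernel::PhysicalMemory, which the physical_memory.h hunk above turns from a type alias into a real class precisely so it can be forward-declared and overloaded on (see the core/memory.h hunk further down). A minimal sketch of the load pattern, assuming a plain byte vector stands in for PhysicalMemory and segments are laid out at increasing offsets:

    #include <cstdint>
    #include <cstring>
    #include <vector>

    using PhysicalMemory = std::vector<std::uint8_t>; // stand-in for Kernel::PhysicalMemory

    void LoadSegment(PhysicalMemory& program_image, const std::vector<std::uint8_t>& data,
                     std::size_t offset) {
        // Size the image once; resize() zero-fills anything between the previous
        // end and `offset`, then the segment bytes are copied into place.
        program_image.resize(offset + data.size());
        std::memcpy(program_image.data() + offset, data.data(), data.size());
    }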
+#include <cstring> #include "core/file_sys/kernel_executable.h" #include "core/file_sys/program_metadata.h" #include "core/gdbstub/gdbstub.h" @@ -76,8 +77,8 @@ AppLoader::LoadResult AppLoader_KIP::Load(Kernel::Process& process) { segment.addr = offset; segment.offset = offset; segment.size = PageAlignSize(static_cast<u32>(data.size())); - program_image.resize(offset); - program_image.insert(program_image.end(), data.begin(), data.end()); + program_image.resize(offset + data.size()); + std::memcpy(program_image.data() + offset, data.data(), data.size()); }; load_segment(codeset.CodeSegment(), kip->GetTextSection(), kip->GetTextOffset()); diff --git a/src/core/loader/nso.cpp b/src/core/loader/nso.cpp index f629892ae..515c5accb 100644 --- a/src/core/loader/nso.cpp +++ b/src/core/loader/nso.cpp @@ -3,6 +3,7 @@ // Refer to the license.txt file included. #include <cinttypes> +#include <cstring> #include <vector> #include "common/common_funcs.h" @@ -96,8 +97,9 @@ std::optional<VAddr> AppLoader_NSO::LoadModule(Kernel::Process& process, if (nso_header.IsSegmentCompressed(i)) { data = DecompressSegment(data, nso_header.segments[i]); } - program_image.resize(nso_header.segments[i].location); - program_image.insert(program_image.end(), data.begin(), data.end()); + program_image.resize(nso_header.segments[i].location + data.size()); + std::memcpy(program_image.data() + nso_header.segments[i].location, data.data(), + data.size()); codeset.segments[i].addr = nso_header.segments[i].location; codeset.segments[i].offset = nso_header.segments[i].location; codeset.segments[i].size = PageAlignSize(static_cast<u32>(data.size())); @@ -139,12 +141,12 @@ std::optional<VAddr> AppLoader_NSO::LoadModule(Kernel::Process& process, std::vector<u8> pi_header; pi_header.insert(pi_header.begin(), reinterpret_cast<u8*>(&nso_header), reinterpret_cast<u8*>(&nso_header) + sizeof(NSOHeader)); - pi_header.insert(pi_header.begin() + sizeof(NSOHeader), program_image.begin(), - program_image.end()); + pi_header.insert(pi_header.begin() + sizeof(NSOHeader), program_image.data(), + program_image.data() + program_image.size()); pi_header = pm->PatchNSO(pi_header, file.GetName()); - std::copy(pi_header.begin() + sizeof(NSOHeader), pi_header.end(), program_image.begin()); + std::copy(pi_header.begin() + sizeof(NSOHeader), pi_header.end(), program_image.data()); } // Apply cheats if they exist and the program has a valid title ID diff --git a/src/core/memory.cpp b/src/core/memory.cpp index 3c2a29d9b..f0888327f 100644 --- a/src/core/memory.cpp +++ b/src/core/memory.cpp @@ -14,6 +14,7 @@ #include "common/swap.h" #include "core/arm/arm_interface.h" #include "core/core.h" +#include "core/hle/kernel/physical_memory.h" #include "core/hle/kernel/process.h" #include "core/hle/kernel/vm_manager.h" #include "core/memory.h" @@ -38,6 +39,11 @@ struct Memory::Impl { system.ArmInterface(3).PageTableChanged(*current_page_table, address_space_width); } + void MapMemoryRegion(Common::PageTable& page_table, VAddr base, u64 size, + Kernel::PhysicalMemory& memory, VAddr offset) { + MapMemoryRegion(page_table, base, size, memory.data() + offset); + } + void MapMemoryRegion(Common::PageTable& page_table, VAddr base, u64 size, u8* target) { ASSERT_MSG((size & PAGE_MASK) == 0, "non-page aligned size: {:016X}", size); ASSERT_MSG((base & PAGE_MASK) == 0, "non-page aligned base: {:016X}", base); @@ -601,6 +607,11 @@ void Memory::SetCurrentPageTable(Kernel::Process& process) { impl->SetCurrentPageTable(process); } +void Memory::MapMemoryRegion(Common::PageTable& 
page_table, VAddr base, u64 size, + Kernel::PhysicalMemory& memory, VAddr offset) { + impl->MapMemoryRegion(page_table, base, size, memory, offset); +} + void Memory::MapMemoryRegion(Common::PageTable& page_table, VAddr base, u64 size, u8* target) { impl->MapMemoryRegion(page_table, base, size, target); } diff --git a/src/core/memory.h b/src/core/memory.h index 1428a6d60..8913a9da4 100644 --- a/src/core/memory.h +++ b/src/core/memory.h @@ -19,8 +19,9 @@ class System; } namespace Kernel { +class PhysicalMemory; class Process; -} +} // namespace Kernel namespace Memory { @@ -66,6 +67,19 @@ public: void SetCurrentPageTable(Kernel::Process& process); /** + * Maps a physical buffer onto a region of the emulated process address space. + * + * @param page_table The page table of the emulated process. + * @param base The address to start mapping at. Must be page-aligned. + * @param size The amount of bytes to map. Must be page-aligned. + * @param memory Physical buffer with the memory backing the mapping. Must be of length + * at least `size + offset`. + * @param offset The offset within the physical memory. Must be page-aligned. + */ + void MapMemoryRegion(Common::PageTable& page_table, VAddr base, u64 size, + Kernel::PhysicalMemory& memory, VAddr offset); + + /** * Maps an allocated buffer onto a region of the emulated process address space. * * @param page_table The page table of the emulated process. diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 142852082..729ee4a01 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -190,8 +190,11 @@ if (ENABLE_VULKAN) renderer_vulkan/vk_stream_buffer.h renderer_vulkan/vk_swapchain.cpp renderer_vulkan/vk_swapchain.h + renderer_vulkan/vk_texture_cache.cpp + renderer_vulkan/vk_texture_cache.h renderer_vulkan/vk_update_descriptor.cpp - renderer_vulkan/vk_update_descriptor.h) + renderer_vulkan/vk_update_descriptor.h + ) target_include_directories(video_core PRIVATE sirit ../../externals/Vulkan-Headers/include) target_compile_definitions(video_core PRIVATE HAS_VULKAN) diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h index a35e7a195..ee79260fc 100644 --- a/src/video_core/engines/maxwell_3d.h +++ b/src/video_core/engines/maxwell_3d.h @@ -1018,7 +1018,14 @@ public: } } instanced_arrays; - INSERT_UNION_PADDING_WORDS(0x6); + INSERT_UNION_PADDING_WORDS(0x4); + + union { + BitField<0, 1, u32> enable; + BitField<4, 8, u32> unk4; + } vp_point_size; + + INSERT_UNION_PADDING_WORDS(1); Cull cull; @@ -1271,8 +1278,6 @@ public: } dirty{}; - std::array<u8, Regs::NUM_REGS> dirty_pointers{}; - /// Reads a register value located at the input method address u32 GetRegisterValue(u32 method) const; @@ -1367,6 +1372,8 @@ private: bool execute_on{true}; + std::array<u8, Regs::NUM_REGS> dirty_pointers{}; + /// Retrieves information about a specific TIC entry from the TIC buffer.
Texture::TICEntry GetTICEntry(u32 tic_index) const; @@ -1503,6 +1510,7 @@ ASSERT_REG_POSITION(primitive_restart, 0x591); ASSERT_REG_POSITION(index_array, 0x5F2); ASSERT_REG_POSITION(polygon_offset_clamp, 0x61F); ASSERT_REG_POSITION(instanced_arrays, 0x620); +ASSERT_REG_POSITION(vp_point_size, 0x644); ASSERT_REG_POSITION(cull, 0x646); ASSERT_REG_POSITION(pixel_center_integer, 0x649); ASSERT_REG_POSITION(viewport_transform_enabled, 0x64B); diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h index 57b57c647..6f98bd827 100644 --- a/src/video_core/engines/shader_bytecode.h +++ b/src/video_core/engines/shader_bytecode.h @@ -215,6 +215,18 @@ enum class F2fRoundingOp : u64 { Trunc = 11, }; +enum class AtomicOp : u64 { + Add = 0, + Min = 1, + Max = 2, + Inc = 3, + Dec = 4, + And = 5, + Or = 6, + Xor = 7, + Exch = 8, +}; + enum class UniformType : u64 { UnsignedByte = 0, SignedByte = 1, @@ -236,6 +248,13 @@ enum class StoreType : u64 { Bits128 = 6, }; +enum class AtomicType : u64 { + U32 = 0, + S32 = 1, + U64 = 2, + S64 = 3, +}; + enum class IMinMaxExchange : u64 { None = 0, XLo = 1, @@ -939,6 +958,16 @@ union Instruction { } stg; union { + BitField<52, 4, AtomicOp> operation; + BitField<28, 2, AtomicType> type; + BitField<30, 22, s64> offset; + + s32 GetImmediateOffset() const { + return static_cast<s32>(offset << 2); + } + } atoms; + + union { BitField<32, 1, PhysicalAttributeDirection> direction; BitField<47, 3, AttributeSize> size; BitField<20, 11, u64> address; @@ -1659,9 +1688,10 @@ public: ST_A, ST_L, ST_S, - ST, // Store in generic memory - STG, // Store in global memory - AL2P, // Transforms attribute memory into physical memory + ST, // Store in generic memory + STG, // Store in global memory + ATOMS, // Atomic operation on shared memory + AL2P, // Transforms attribute memory into physical memory TEX, TEX_B, // Texture Load Bindless TXQ, // Texture Query @@ -1964,6 +1994,7 @@ private: INST("1110111101010---", Id::ST_L, Type::Memory, "ST_L"), INST("101-------------", Id::ST, Type::Memory, "ST"), INST("1110111011011---", Id::STG, Type::Memory, "STG"), + INST("11101100--------", Id::ATOMS, Type::Memory, "ATOMS"), INST("1110111110100---", Id::AL2P, Type::Memory, "AL2P"), INST("110000----111---", Id::TEX, Type::Texture, "TEX"), INST("1101111010111---", Id::TEX_B, Type::Texture, "TEX_B"), diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index 672051102..926bccd42 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -1272,6 +1272,7 @@ void RasterizerOpenGL::SyncPointState() { const auto& regs = system.GPU().Maxwell3D().regs; // Limit the point size to 1 since nouveau sometimes sets a point size of 0 (and that's invalid // in OpenGL). + state.point.program_control = regs.vp_point_size.enable ? 
GL_TRUE : GL_FALSE; state.point.size = std::max(1.0f, regs.point_size); } diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp index de742d11c..a4acb3796 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp @@ -34,9 +34,6 @@ using VideoCommon::Shader::ShaderIR; namespace { -// One UBO is always reserved for emulation values on staged shaders -constexpr u32 STAGE_RESERVED_UBOS = 1; - constexpr u32 STAGE_MAIN_OFFSET = 10; constexpr u32 KERNEL_MAIN_OFFSET = 0; @@ -243,7 +240,6 @@ CachedProgram BuildShader(const Device& device, u64 unique_identifier, ShaderTyp if (!code_b.empty()) { ir_b.emplace(code_b, main_offset, COMPILER_SETTINGS, locker); } - const auto entries = GLShader::GetEntries(ir); std::string source = fmt::format(R"(// {} #version 430 core @@ -314,9 +310,10 @@ std::unordered_set<GLenum> GetSupportedFormats() { CachedShader::CachedShader(const ShaderParameters& params, ShaderType shader_type, GLShader::ShaderEntries entries, ProgramCode code, ProgramCode code_b) - : RasterizerCacheObject{params.host_ptr}, system{params.system}, disk_cache{params.disk_cache}, - device{params.device}, cpu_addr{params.cpu_addr}, unique_identifier{params.unique_identifier}, - shader_type{shader_type}, entries{entries}, code{std::move(code)}, code_b{std::move(code_b)} { + : RasterizerCacheObject{params.host_ptr}, system{params.system}, + disk_cache{params.disk_cache}, device{params.device}, cpu_addr{params.cpu_addr}, + unique_identifier{params.unique_identifier}, shader_type{shader_type}, + entries{std::move(entries)}, code{std::move(code)}, code_b{std::move(code_b)} { if (!params.precompiled_variants) { return; } diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp index f9f7a97b5..19751939a 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp @@ -1856,6 +1856,16 @@ private: Type::Uint}; } + template <const std::string_view& opname, Type type> + Expression Atomic(Operation operation) { + ASSERT(stage == ShaderType::Compute); + auto& smem = std::get<SmemNode>(*operation[0]); + + return {fmt::format("atomic{}(smem[{} >> 2], {})", opname, Visit(smem.GetAddress()).AsInt(), + Visit(operation[1]).As(type)), + type}; + } + Expression Branch(Operation operation) { const auto target = std::get_if<ImmediateNode>(&*operation[0]); UNIMPLEMENTED_IF(!target); @@ -2194,6 +2204,8 @@ private: &GLSLDecompiler::AtomicImage<Func::Xor>, &GLSLDecompiler::AtomicImage<Func::Exchange>, + &GLSLDecompiler::Atomic<Func::Add, Type::Uint>, + &GLSLDecompiler::Branch, &GLSLDecompiler::BranchIndirect, &GLSLDecompiler::PushFlowStack, diff --git a/src/video_core/renderer_opengl/gl_state.cpp b/src/video_core/renderer_opengl/gl_state.cpp index df2e2395a..cc185e9e1 100644 --- a/src/video_core/renderer_opengl/gl_state.cpp +++ b/src/video_core/renderer_opengl/gl_state.cpp @@ -127,6 +127,7 @@ void OpenGLState::ApplyClipDistances() { } void OpenGLState::ApplyPointSize() { + Enable(GL_PROGRAM_POINT_SIZE, cur_state.point.program_control, point.program_control); if (UpdateValue(cur_state.point.size, point.size)) { glPointSize(point.size); } diff --git a/src/video_core/renderer_opengl/gl_state.h b/src/video_core/renderer_opengl/gl_state.h index fb180f302..71d418776 100644 --- a/src/video_core/renderer_opengl/gl_state.h +++ b/src/video_core/renderer_opengl/gl_state.h @@ 
-131,7 +131,8 @@ public: std::array<Viewport, Tegra::Engines::Maxwell3D::Regs::NumViewports> viewports; struct { - float size = 1.0f; // GL_POINT_SIZE + GLboolean program_control = GL_FALSE; // GL_PROGRAM_POINT_SIZE + GLfloat size = 1.0f; // GL_POINT_SIZE } point; struct { diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index b790b0ef4..e95eb069e 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -44,7 +44,7 @@ struct FormatTuple { constexpr std::array<FormatTuple, VideoCore::Surface::MaxPixelFormat> tex_format_tuples = {{ {GL_RGBA8, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, false}, // ABGR8U - {GL_RGBA8, GL_RGBA, GL_BYTE, false}, // ABGR8S + {GL_RGBA8_SNORM, GL_RGBA, GL_BYTE, false}, // ABGR8S {GL_RGBA8UI, GL_RGBA_INTEGER, GL_UNSIGNED_BYTE, false}, // ABGR8UI {GL_RGB565, GL_RGB, GL_UNSIGNED_SHORT_5_6_5_REV, false}, // B5G6R5U {GL_RGB10_A2, GL_RGBA, GL_UNSIGNED_INT_2_10_10_10_REV, false}, // A2B10G10R10U @@ -83,9 +83,9 @@ constexpr std::array<FormatTuple, VideoCore::Surface::MaxPixelFormat> tex_format {GL_RGB32F, GL_RGB, GL_FLOAT, false}, // RGB32F {GL_SRGB8_ALPHA8, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, false}, // RGBA8_SRGB {GL_RG8, GL_RG, GL_UNSIGNED_BYTE, false}, // RG8U - {GL_RG8, GL_RG, GL_BYTE, false}, // RG8S + {GL_RG8_SNORM, GL_RG, GL_BYTE, false}, // RG8S {GL_RG32UI, GL_RG_INTEGER, GL_UNSIGNED_INT, false}, // RG32UI - {GL_RGB16F, GL_RGBA16, GL_HALF_FLOAT, false}, // RGBX16F + {GL_RGB16F, GL_RGBA, GL_HALF_FLOAT, false}, // RGBX16F {GL_R32UI, GL_RED_INTEGER, GL_UNSIGNED_INT, false}, // R32UI {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE, false}, // ASTC_2D_8X8 {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE, false}, // ASTC_2D_8X5 @@ -253,14 +253,12 @@ void CachedSurface::DownloadTexture(std::vector<u8>& staging_buffer) { glPixelStorei(GL_PACK_ALIGNMENT, std::min(8U, params.GetRowAlignment(level))); glPixelStorei(GL_PACK_ROW_LENGTH, static_cast<GLint>(params.GetMipWidth(level))); const std::size_t mip_offset = params.GetHostMipmapLevelOffset(level); + u8* const mip_data = staging_buffer.data() + mip_offset; + const GLsizei size = static_cast<GLsizei>(params.GetHostMipmapSize(level)); if (is_compressed) { - glGetCompressedTextureImage(texture.handle, level, - static_cast<GLsizei>(params.GetHostMipmapSize(level)), - staging_buffer.data() + mip_offset); + glGetCompressedTextureImage(texture.handle, level, size, mip_data); } else { - glGetTextureImage(texture.handle, level, format, type, - static_cast<GLsizei>(params.GetHostMipmapSize(level)), - staging_buffer.data() + mip_offset); + glGetTextureImage(texture.handle, level, format, type, size, mip_data); } } } diff --git a/src/video_core/renderer_opengl/utils.cpp b/src/video_core/renderer_opengl/utils.cpp index 9770dda1c..ac99e6385 100644 --- a/src/video_core/renderer_opengl/utils.cpp +++ b/src/video_core/renderer_opengl/utils.cpp @@ -6,16 +6,20 @@ #include <vector> #include <fmt/format.h> - #include <glad/glad.h> -#include "common/assert.h" #include "common/common_types.h" -#include "common/scope_exit.h" #include "video_core/renderer_opengl/utils.h" namespace OpenGL { +struct VertexArrayPushBuffer::Entry { + GLuint binding_index{}; + const GLuint* buffer{}; + GLintptr offset{}; + GLsizei stride{}; +}; + VertexArrayPushBuffer::VertexArrayPushBuffer() = default; VertexArrayPushBuffer::~VertexArrayPushBuffer() = default; @@ -47,6 +51,13 @@ void VertexArrayPushBuffer::Bind() { } } +struct 
BindBuffersRangePushBuffer::Entry { + GLuint binding; + const GLuint* buffer; + GLintptr offset; + GLsizeiptr size; +}; + BindBuffersRangePushBuffer::BindBuffersRangePushBuffer(GLenum target) : target{target} {} BindBuffersRangePushBuffer::~BindBuffersRangePushBuffer() = default; diff --git a/src/video_core/renderer_opengl/utils.h b/src/video_core/renderer_opengl/utils.h index d56153fe7..3ad7c02d4 100644 --- a/src/video_core/renderer_opengl/utils.h +++ b/src/video_core/renderer_opengl/utils.h @@ -26,12 +26,7 @@ public: void Bind(); private: - struct Entry { - GLuint binding_index{}; - const GLuint* buffer{}; - GLintptr offset{}; - GLsizei stride{}; - }; + struct Entry; GLuint vao{}; const GLuint* index_buffer{}; @@ -50,12 +45,7 @@ public: void Bind(); private: - struct Entry { - GLuint binding; - const GLuint* buffer; - GLintptr offset; - GLsizeiptr size; - }; + struct Entry; GLenum target; std::vector<Entry> entries; diff --git a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp index 000e3616d..331808113 100644 --- a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp +++ b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp @@ -44,7 +44,7 @@ vk::SamplerMipmapMode MipmapMode(Tegra::Texture::TextureMipmapFilter mipmap_filt return {}; } -vk::SamplerAddressMode WrapMode(Tegra::Texture::WrapMode wrap_mode, +vk::SamplerAddressMode WrapMode(const VKDevice& device, Tegra::Texture::WrapMode wrap_mode, Tegra::Texture::TextureFilter filter) { switch (wrap_mode) { case Tegra::Texture::WrapMode::Wrap: @@ -56,7 +56,12 @@ vk::SamplerAddressMode WrapMode(Tegra::Texture::WrapMode wrap_mode, case Tegra::Texture::WrapMode::Border: return vk::SamplerAddressMode::eClampToBorder; case Tegra::Texture::WrapMode::Clamp: - // TODO(Rodrigo): Emulate GL_CLAMP properly + if (device.GetDriverID() == vk::DriverIdKHR::eNvidiaProprietary) { + // Nvidia's Vulkan driver defaults to GL_CLAMP on invalid enumerations, we can hack this + // by sending an invalid enumeration. 
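For context: classic GL_CLAMP clamps coordinates to [0, 1] and, under linear filtering, blends edge texels with the border color, which none of Vulkan's standard address modes reproduce. Callers now thread the device through so the helper can apply this workaround; a hedged sketch of a call site (names mirror the vk_sampler_cache.cpp hunk below):

    // Hypothetical call site: the device parameter lets WrapMode() special-case
    // the Nvidia proprietary driver.
    const vk::SamplerAddressMode wrap_u =
        MaxwellToVK::Sampler::WrapMode(device, tsc.wrap_u, tsc.mag_filter);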
+ return static_cast<vk::SamplerAddressMode>(0xcafe); + } + // TODO(Rodrigo): Emulate GL_CLAMP properly on other vendors switch (filter) { case Tegra::Texture::TextureFilter::Nearest: return vk::SamplerAddressMode::eClampToEdge; diff --git a/src/video_core/renderer_vulkan/maxwell_to_vk.h b/src/video_core/renderer_vulkan/maxwell_to_vk.h index 1534b738b..7e9678b7b 100644 --- a/src/video_core/renderer_vulkan/maxwell_to_vk.h +++ b/src/video_core/renderer_vulkan/maxwell_to_vk.h @@ -22,7 +22,7 @@ vk::Filter Filter(Tegra::Texture::TextureFilter filter); vk::SamplerMipmapMode MipmapMode(Tegra::Texture::TextureMipmapFilter mipmap_filter); -vk::SamplerAddressMode WrapMode(Tegra::Texture::WrapMode wrap_mode, +vk::SamplerAddressMode WrapMode(const VKDevice& device, Tegra::Texture::WrapMode wrap_mode, Tegra::Texture::TextureFilter filter); vk::CompareOp DepthCompareFunction(Tegra::Texture::DepthCompareFunc depth_compare_func); diff --git a/src/video_core/renderer_vulkan/vk_sampler_cache.cpp b/src/video_core/renderer_vulkan/vk_sampler_cache.cpp index 1ce583f75..0a8ec8398 100644 --- a/src/video_core/renderer_vulkan/vk_sampler_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_sampler_cache.cpp @@ -46,9 +46,9 @@ UniqueSampler VKSamplerCache::CreateSampler(const Tegra::Texture::TSCEntry& tsc) {}, MaxwellToVK::Sampler::Filter(tsc.mag_filter), MaxwellToVK::Sampler::Filter(tsc.min_filter), MaxwellToVK::Sampler::MipmapMode(tsc.mipmap_filter), - MaxwellToVK::Sampler::WrapMode(tsc.wrap_u, tsc.mag_filter), - MaxwellToVK::Sampler::WrapMode(tsc.wrap_v, tsc.mag_filter), - MaxwellToVK::Sampler::WrapMode(tsc.wrap_p, tsc.mag_filter), tsc.GetLodBias(), + MaxwellToVK::Sampler::WrapMode(device, tsc.wrap_u, tsc.mag_filter), + MaxwellToVK::Sampler::WrapMode(device, tsc.wrap_v, tsc.mag_filter), + MaxwellToVK::Sampler::WrapMode(device, tsc.wrap_p, tsc.mag_filter), tsc.GetLodBias(), has_anisotropy, max_anisotropy, tsc.depth_compare_enabled, MaxwellToVK::Sampler::DepthCompareFunction(tsc.depth_compare_func), tsc.GetMinLod(), tsc.GetMaxLod(), vk_border_color.value_or(vk::BorderColor::eFloatTransparentBlack), diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp index 8fe852ce8..0cf97cafa 100644 --- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp +++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp @@ -1796,6 +1796,11 @@ private: return {}; } + Expression UAtomicAdd(Operation) { + UNIMPLEMENTED(); + return {}; + } + Expression Branch(Operation operation) { const auto& target = std::get<ImmediateNode>(*operation[0]); OpStore(jmp_to, Constant(t_uint, target.GetValue())); @@ -2373,6 +2378,8 @@ private: &SPIRVDecompiler::AtomicImageXor, &SPIRVDecompiler::AtomicImageExchange, + &SPIRVDecompiler::UAtomicAdd, + &SPIRVDecompiler::Branch, &SPIRVDecompiler::BranchIndirect, &SPIRVDecompiler::PushFlowStack, diff --git a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h index 02310375f..4d9488f49 100644 --- a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h +++ b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h @@ -13,6 +13,7 @@ #include "video_core/renderer_vulkan/declarations.h" #include "video_core/renderer_vulkan/vk_memory_manager.h" +#include "video_core/renderer_vulkan/vk_resource_manager.h" namespace Vulkan { diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp new file mode 100644 index 
000000000..51b0d38a6 --- /dev/null +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -0,0 +1,475 @@ +// Copyright 2019 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <algorithm> +#include <array> +#include <cstddef> +#include <cstring> +#include <memory> +#include <variant> +#include <vector> + +#include "common/alignment.h" +#include "common/assert.h" +#include "common/common_types.h" +#include "core/core.h" +#include "core/memory.h" +#include "video_core/engines/maxwell_3d.h" +#include "video_core/morton.h" +#include "video_core/renderer_vulkan/declarations.h" +#include "video_core/renderer_vulkan/maxwell_to_vk.h" +#include "video_core/renderer_vulkan/vk_device.h" +#include "video_core/renderer_vulkan/vk_memory_manager.h" +#include "video_core/renderer_vulkan/vk_rasterizer.h" +#include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" +#include "video_core/renderer_vulkan/vk_texture_cache.h" +#include "video_core/surface.h" +#include "video_core/textures/convert.h" + +namespace Vulkan { + +using VideoCore::MortonSwizzle; +using VideoCore::MortonSwizzleMode; + +using Tegra::Texture::SwizzleSource; +using VideoCore::Surface::PixelFormat; +using VideoCore::Surface::SurfaceCompression; +using VideoCore::Surface::SurfaceTarget; + +namespace { + +vk::ImageType SurfaceTargetToImage(SurfaceTarget target) { + switch (target) { + case SurfaceTarget::Texture1D: + case SurfaceTarget::Texture1DArray: + return vk::ImageType::e1D; + case SurfaceTarget::Texture2D: + case SurfaceTarget::Texture2DArray: + case SurfaceTarget::TextureCubemap: + case SurfaceTarget::TextureCubeArray: + return vk::ImageType::e2D; + case SurfaceTarget::Texture3D: + return vk::ImageType::e3D; + } + UNREACHABLE_MSG("Unknown texture target={}", static_cast<u32>(target)); + return {}; +} + +vk::ImageAspectFlags PixelFormatToImageAspect(PixelFormat pixel_format) { + if (pixel_format < PixelFormat::MaxColorFormat) { + return vk::ImageAspectFlagBits::eColor; + } else if (pixel_format < PixelFormat::MaxDepthFormat) { + return vk::ImageAspectFlagBits::eDepth; + } else if (pixel_format < PixelFormat::MaxDepthStencilFormat) { + return vk::ImageAspectFlagBits::eDepth | vk::ImageAspectFlagBits::eStencil; + } else { + UNREACHABLE_MSG("Invalid pixel format={}", static_cast<u32>(pixel_format)); + return vk::ImageAspectFlagBits::eColor; + } +} + +vk::ImageViewType GetImageViewType(SurfaceTarget target) { + switch (target) { + case SurfaceTarget::Texture1D: + return vk::ImageViewType::e1D; + case SurfaceTarget::Texture2D: + return vk::ImageViewType::e2D; + case SurfaceTarget::Texture3D: + return vk::ImageViewType::e3D; + case SurfaceTarget::Texture1DArray: + return vk::ImageViewType::e1DArray; + case SurfaceTarget::Texture2DArray: + return vk::ImageViewType::e2DArray; + case SurfaceTarget::TextureCubemap: + return vk::ImageViewType::eCube; + case SurfaceTarget::TextureCubeArray: + return vk::ImageViewType::eCubeArray; + case SurfaceTarget::TextureBuffer: + break; + } + UNREACHABLE(); + return {}; +} + +UniqueBuffer CreateBuffer(const VKDevice& device, const SurfaceParams& params) { + // TODO(Rodrigo): Move texture buffer creation to the buffer cache + const vk::BufferCreateInfo buffer_ci({}, params.GetHostSizeInBytes(), + vk::BufferUsageFlagBits::eUniformTexelBuffer | + vk::BufferUsageFlagBits::eTransferSrc | + vk::BufferUsageFlagBits::eTransferDst, + vk::SharingMode::eExclusive, 0, nullptr); + const auto dev = device.GetLogical(); + const auto& dld = 
device.GetDispatchLoader(); + return dev.createBufferUnique(buffer_ci, nullptr, dld); +} + +vk::BufferViewCreateInfo GenerateBufferViewCreateInfo(const VKDevice& device, + const SurfaceParams& params, + vk::Buffer buffer) { + ASSERT(params.IsBuffer()); + + const auto format = + MaxwellToVK::SurfaceFormat(device, FormatType::Buffer, params.pixel_format).format; + return vk::BufferViewCreateInfo({}, buffer, format, 0, params.GetHostSizeInBytes()); +} + +vk::ImageCreateInfo GenerateImageCreateInfo(const VKDevice& device, const SurfaceParams& params) { + constexpr auto sample_count = vk::SampleCountFlagBits::e1; + constexpr auto tiling = vk::ImageTiling::eOptimal; + + ASSERT(!params.IsBuffer()); + + const auto [format, attachable, storage] = + MaxwellToVK::SurfaceFormat(device, FormatType::Optimal, params.pixel_format); + + auto image_usage = vk::ImageUsageFlagBits::eSampled | vk::ImageUsageFlagBits::eTransferDst | + vk::ImageUsageFlagBits::eTransferSrc; + if (attachable) { + image_usage |= params.IsPixelFormatZeta() ? vk::ImageUsageFlagBits::eDepthStencilAttachment + : vk::ImageUsageFlagBits::eColorAttachment; + } + if (storage) { + image_usage |= vk::ImageUsageFlagBits::eStorage; + } + + vk::ImageCreateFlags flags; + vk::Extent3D extent; + switch (params.target) { + case SurfaceTarget::TextureCubemap: + case SurfaceTarget::TextureCubeArray: + flags |= vk::ImageCreateFlagBits::eCubeCompatible; + [[fallthrough]]; + case SurfaceTarget::Texture1D: + case SurfaceTarget::Texture1DArray: + case SurfaceTarget::Texture2D: + case SurfaceTarget::Texture2DArray: + extent = vk::Extent3D(params.width, params.height, 1); + break; + case SurfaceTarget::Texture3D: + extent = vk::Extent3D(params.width, params.height, params.depth); + break; + case SurfaceTarget::TextureBuffer: + UNREACHABLE(); + } + + return vk::ImageCreateInfo(flags, SurfaceTargetToImage(params.target), format, extent, + params.num_levels, static_cast<u32>(params.GetNumLayers()), + sample_count, tiling, image_usage, vk::SharingMode::eExclusive, 0, + nullptr, vk::ImageLayout::eUndefined); +} + +} // Anonymous namespace + +CachedSurface::CachedSurface(Core::System& system, const VKDevice& device, + VKResourceManager& resource_manager, VKMemoryManager& memory_manager, + VKScheduler& scheduler, VKStagingBufferPool& staging_pool, + GPUVAddr gpu_addr, const SurfaceParams& params) + : SurfaceBase<View>{gpu_addr, params}, system{system}, device{device}, + resource_manager{resource_manager}, memory_manager{memory_manager}, scheduler{scheduler}, + staging_pool{staging_pool} { + if (params.IsBuffer()) { + buffer = CreateBuffer(device, params); + commit = memory_manager.Commit(*buffer, false); + + const auto buffer_view_ci = GenerateBufferViewCreateInfo(device, params, *buffer); + format = buffer_view_ci.format; + + const auto dev = device.GetLogical(); + const auto& dld = device.GetDispatchLoader(); + buffer_view = dev.createBufferViewUnique(buffer_view_ci, nullptr, dld); + } else { + const auto image_ci = GenerateImageCreateInfo(device, params); + format = image_ci.format; + + image.emplace(device, scheduler, image_ci, PixelFormatToImageAspect(params.pixel_format)); + commit = memory_manager.Commit(image->GetHandle(), false); + } + + // TODO(Rodrigo): Move this to a virtual function. 
+ main_view = CreateViewInner( + ViewParams(params.target, 0, static_cast<u32>(params.GetNumLayers()), 0, params.num_levels), + true); +} + +CachedSurface::~CachedSurface() = default; + +void CachedSurface::UploadTexture(const std::vector<u8>& staging_buffer) { + // To upload data we have to be outside of a renderpass + scheduler.RequestOutsideRenderPassOperationContext(); + + if (params.IsBuffer()) { + UploadBuffer(staging_buffer); + } else { + UploadImage(staging_buffer); + } +} + +void CachedSurface::DownloadTexture(std::vector<u8>& staging_buffer) { + UNIMPLEMENTED_IF(params.IsBuffer()); + + if (params.pixel_format == VideoCore::Surface::PixelFormat::A1B5G5R5U) { + LOG_WARNING(Render_Vulkan, "A1B5G5R5 flushing is stubbed"); + } + + // We can't copy images to buffers inside a renderpass + scheduler.RequestOutsideRenderPassOperationContext(); + + FullTransition(vk::PipelineStageFlagBits::eTransfer, vk::AccessFlagBits::eTransferRead, + vk::ImageLayout::eTransferSrcOptimal); + + const auto& buffer = staging_pool.GetUnusedBuffer(host_memory_size, true); + // TODO(Rodrigo): Do this in a single copy + for (u32 level = 0; level < params.num_levels; ++level) { + scheduler.Record([image = image->GetHandle(), buffer = *buffer.handle, + copy = GetBufferImageCopy(level)](auto cmdbuf, auto& dld) { + cmdbuf.copyImageToBuffer(image, vk::ImageLayout::eTransferSrcOptimal, buffer, {copy}, + dld); + }); + } + scheduler.Finish(); + + // TODO(Rodrigo): Use an intern buffer for staging buffers and avoid this unnecessary memcpy. + std::memcpy(staging_buffer.data(), buffer.commit->Map(host_memory_size), host_memory_size); +} + +void CachedSurface::DecorateSurfaceName() { + // TODO(Rodrigo): Add name decorations +} + +View CachedSurface::CreateView(const ViewParams& params) { + return CreateViewInner(params, false); +} + +View CachedSurface::CreateViewInner(const ViewParams& params, bool is_proxy) { + // TODO(Rodrigo): Add name decorations + return views[params] = std::make_shared<CachedSurfaceView>(device, *this, params, is_proxy); +} + +void CachedSurface::UploadBuffer(const std::vector<u8>& staging_buffer) { + const auto& src_buffer = staging_pool.GetUnusedBuffer(host_memory_size, true); + std::memcpy(src_buffer.commit->Map(host_memory_size), staging_buffer.data(), host_memory_size); + + scheduler.Record([src_buffer = *src_buffer.handle, dst_buffer = *buffer, + size = params.GetHostSizeInBytes()](auto cmdbuf, auto& dld) { + const vk::BufferCopy copy(0, 0, size); + cmdbuf.copyBuffer(src_buffer, dst_buffer, {copy}, dld); + + cmdbuf.pipelineBarrier( + vk::PipelineStageFlagBits::eTransfer, vk::PipelineStageFlagBits::eVertexShader, {}, {}, + {vk::BufferMemoryBarrier(vk::AccessFlagBits::eTransferWrite, + vk::AccessFlagBits::eShaderRead, 0, 0, dst_buffer, 0, size)}, + {}, dld); + }); +} + +void CachedSurface::UploadImage(const std::vector<u8>& staging_buffer) { + const auto& src_buffer = staging_pool.GetUnusedBuffer(host_memory_size, true); + std::memcpy(src_buffer.commit->Map(host_memory_size), staging_buffer.data(), host_memory_size); + + FullTransition(vk::PipelineStageFlagBits::eTransfer, vk::AccessFlagBits::eTransferWrite, + vk::ImageLayout::eTransferDstOptimal); + + for (u32 level = 0; level < params.num_levels; ++level) { + vk::BufferImageCopy copy = GetBufferImageCopy(level); + const auto& dld = device.GetDispatchLoader(); + if (image->GetAspectMask() == + (vk::ImageAspectFlagBits::eDepth | vk::ImageAspectFlagBits::eStencil)) { + vk::BufferImageCopy depth = copy; + vk::BufferImageCopy stencil = copy; + 
depth.imageSubresource.aspectMask = vk::ImageAspectFlagBits::eDepth; + stencil.imageSubresource.aspectMask = vk::ImageAspectFlagBits::eStencil; + scheduler.Record([buffer = *src_buffer.handle, image = image->GetHandle(), depth, + stencil](auto cmdbuf, auto& dld) { + cmdbuf.copyBufferToImage(buffer, image, vk::ImageLayout::eTransferDstOptimal, + {depth, stencil}, dld); + }); + } else { + scheduler.Record([buffer = *src_buffer.handle, image = image->GetHandle(), + copy](auto cmdbuf, auto& dld) { + cmdbuf.copyBufferToImage(buffer, image, vk::ImageLayout::eTransferDstOptimal, + {copy}, dld); + }); + } + } +} + +vk::BufferImageCopy CachedSurface::GetBufferImageCopy(u32 level) const { + const u32 vk_depth = params.target == SurfaceTarget::Texture3D ? params.GetMipDepth(level) : 1; + const auto compression_type = params.GetCompressionType(); + const std::size_t mip_offset = compression_type == SurfaceCompression::Converted + ? params.GetConvertedMipmapOffset(level) + : params.GetHostMipmapLevelOffset(level); + + return vk::BufferImageCopy( + mip_offset, 0, 0, + {image->GetAspectMask(), level, 0, static_cast<u32>(params.GetNumLayers())}, {0, 0, 0}, + {params.GetMipWidth(level), params.GetMipHeight(level), vk_depth}); +} + +vk::ImageSubresourceRange CachedSurface::GetImageSubresourceRange() const { + return {image->GetAspectMask(), 0, params.num_levels, 0, + static_cast<u32>(params.GetNumLayers())}; +} + +CachedSurfaceView::CachedSurfaceView(const VKDevice& device, CachedSurface& surface, + const ViewParams& params, bool is_proxy) + : VideoCommon::ViewBase{params}, params{surface.GetSurfaceParams()}, + image{surface.GetImageHandle()}, buffer_view{surface.GetBufferViewHandle()}, + aspect_mask{surface.GetAspectMask()}, device{device}, surface{surface}, + base_layer{params.base_layer}, num_layers{params.num_layers}, base_level{params.base_level}, + num_levels{params.num_levels}, image_view_type{image ? GetImageViewType(params.target) + : vk::ImageViewType{}} {} + +CachedSurfaceView::~CachedSurfaceView() = default; + +vk::ImageView CachedSurfaceView::GetHandle(SwizzleSource x_source, SwizzleSource y_source, + SwizzleSource z_source, SwizzleSource w_source) { + const u32 swizzle = EncodeSwizzle(x_source, y_source, z_source, w_source); + if (last_image_view && last_swizzle == swizzle) { + return last_image_view; + } + last_swizzle = swizzle; + + const auto [entry, is_cache_miss] = view_cache.try_emplace(swizzle); + auto& image_view = entry->second; + if (!is_cache_miss) { + return last_image_view = *image_view; + } + + auto swizzle_x = MaxwellToVK::SwizzleSource(x_source); + auto swizzle_y = MaxwellToVK::SwizzleSource(y_source); + auto swizzle_z = MaxwellToVK::SwizzleSource(z_source); + auto swizzle_w = MaxwellToVK::SwizzleSource(w_source); + + if (params.pixel_format == VideoCore::Surface::PixelFormat::A1B5G5R5U) { + // A1B5G5R5 is implemented as A1R5G5B5, we have to change the swizzle here. + std::swap(swizzle_x, swizzle_z); + } + + // Games can sample depth or stencil values on textures. This is decided by the swizzle value on + // hardware. To emulate this on Vulkan we specify it in the aspect. 
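Concretely, for a combined depth-stencil format the X swizzle source selects the aspect, as the code continuing below implements:

    // x_source == SwizzleSource::R on Z24S8 / Z32FS8 -> depth aspect
    // x_source == SwizzleSource::G on Z24S8 / Z32FS8 -> stencil aspect
    // x_source == SwizzleSource::R on S8Z24          -> stencil aspect (component order flipped)
    // x_source == SwizzleSource::G on S8Z24          -> depth aspect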
+ vk::ImageAspectFlags aspect = aspect_mask; + if (aspect == (vk::ImageAspectFlagBits::eDepth | vk::ImageAspectFlagBits::eStencil)) { + UNIMPLEMENTED_IF(x_source != SwizzleSource::R && x_source != SwizzleSource::G); + const bool is_first = x_source == SwizzleSource::R; + switch (params.pixel_format) { + case VideoCore::Surface::PixelFormat::Z24S8: + case VideoCore::Surface::PixelFormat::Z32FS8: + aspect = is_first ? vk::ImageAspectFlagBits::eDepth : vk::ImageAspectFlagBits::eStencil; + break; + case VideoCore::Surface::PixelFormat::S8Z24: + aspect = is_first ? vk::ImageAspectFlagBits::eStencil : vk::ImageAspectFlagBits::eDepth; + break; + default: + aspect = vk::ImageAspectFlagBits::eDepth; + UNIMPLEMENTED(); + } + + // Vulkan doesn't seem to understand swizzling of a depth stencil image, use identity + swizzle_x = vk::ComponentSwizzle::eR; + swizzle_y = vk::ComponentSwizzle::eG; + swizzle_z = vk::ComponentSwizzle::eB; + swizzle_w = vk::ComponentSwizzle::eA; + } + + const vk::ImageViewCreateInfo image_view_ci( + {}, surface.GetImageHandle(), image_view_type, surface.GetImage().GetFormat(), + {swizzle_x, swizzle_y, swizzle_z, swizzle_w}, + {aspect, base_level, num_levels, base_layer, num_layers}); + + const auto dev = device.GetLogical(); + image_view = dev.createImageViewUnique(image_view_ci, nullptr, device.GetDispatchLoader()); + return last_image_view = *image_view; +} + +VKTextureCache::VKTextureCache(Core::System& system, VideoCore::RasterizerInterface& rasterizer, + const VKDevice& device, VKResourceManager& resource_manager, + VKMemoryManager& memory_manager, VKScheduler& scheduler, + VKStagingBufferPool& staging_pool) + : TextureCache(system, rasterizer), device{device}, resource_manager{resource_manager}, + memory_manager{memory_manager}, scheduler{scheduler}, staging_pool{staging_pool} {} + +VKTextureCache::~VKTextureCache() = default; + +Surface VKTextureCache::CreateSurface(GPUVAddr gpu_addr, const SurfaceParams& params) { + return std::make_shared<CachedSurface>(system, device, resource_manager, memory_manager, + scheduler, staging_pool, gpu_addr, params); +} + +void VKTextureCache::ImageCopy(Surface& src_surface, Surface& dst_surface, + const VideoCommon::CopyParams& copy_params) { + const bool src_3d = src_surface->GetSurfaceParams().target == SurfaceTarget::Texture3D; + const bool dst_3d = dst_surface->GetSurfaceParams().target == SurfaceTarget::Texture3D; + UNIMPLEMENTED_IF(src_3d); + + // The texture cache handles depth in OpenGL terms, we have to handle it as subresource and + // dimension respectively. + const u32 dst_base_layer = dst_3d ? 0 : copy_params.dest_z; + const u32 dst_offset_z = dst_3d ? copy_params.dest_z : 0; + + const u32 extent_z = dst_3d ? copy_params.depth : 1; + const u32 num_layers = dst_3d ? 
1 : copy_params.depth; + + // We can't copy inside a renderpass + scheduler.RequestOutsideRenderPassOperationContext(); + + src_surface->Transition(copy_params.source_z, copy_params.depth, copy_params.source_level, 1, + vk::PipelineStageFlagBits::eTransfer, vk::AccessFlagBits::eTransferRead, + vk::ImageLayout::eTransferSrcOptimal); + dst_surface->Transition( + dst_base_layer, num_layers, copy_params.dest_level, 1, vk::PipelineStageFlagBits::eTransfer, + vk::AccessFlagBits::eTransferWrite, vk::ImageLayout::eTransferDstOptimal); + + const auto& dld{device.GetDispatchLoader()}; + const vk::ImageSubresourceLayers src_subresource( + src_surface->GetAspectMask(), copy_params.source_level, copy_params.source_z, num_layers); + const vk::ImageSubresourceLayers dst_subresource( + dst_surface->GetAspectMask(), copy_params.dest_level, dst_base_layer, num_layers); + const vk::Offset3D src_offset(copy_params.source_x, copy_params.source_y, 0); + const vk::Offset3D dst_offset(copy_params.dest_x, copy_params.dest_y, dst_offset_z); + const vk::Extent3D extent(copy_params.width, copy_params.height, extent_z); + const vk::ImageCopy copy(src_subresource, src_offset, dst_subresource, dst_offset, extent); + const vk::Image src_image = src_surface->GetImageHandle(); + const vk::Image dst_image = dst_surface->GetImageHandle(); + scheduler.Record([src_image, dst_image, copy](auto cmdbuf, auto& dld) { + cmdbuf.copyImage(src_image, vk::ImageLayout::eTransferSrcOptimal, dst_image, + vk::ImageLayout::eTransferDstOptimal, {copy}, dld); + }); +} + void VKTextureCache::ImageBlit(View& src_view, View& dst_view, + const Tegra::Engines::Fermi2D::Config& copy_config) { + // We can't blit inside a renderpass + scheduler.RequestOutsideRenderPassOperationContext(); + + src_view->Transition(vk::ImageLayout::eTransferSrcOptimal, vk::PipelineStageFlagBits::eTransfer, + vk::AccessFlagBits::eTransferRead); + dst_view->Transition(vk::ImageLayout::eTransferDstOptimal, vk::PipelineStageFlagBits::eTransfer, + vk::AccessFlagBits::eTransferWrite); + + const auto& cfg = copy_config; + const auto src_top_left = vk::Offset3D(cfg.src_rect.left, cfg.src_rect.top, 0); + const auto src_bot_right = vk::Offset3D(cfg.src_rect.right, cfg.src_rect.bottom, 1); + const auto dst_top_left = vk::Offset3D(cfg.dst_rect.left, cfg.dst_rect.top, 0); + const auto dst_bot_right = vk::Offset3D(cfg.dst_rect.right, cfg.dst_rect.bottom, 1); + const vk::ImageBlit blit(src_view->GetImageSubresourceLayers(), {src_top_left, src_bot_right}, + dst_view->GetImageSubresourceLayers(), {dst_top_left, dst_bot_right}); + const bool is_linear = copy_config.filter == Tegra::Engines::Fermi2D::Filter::Linear; + + const auto& dld{device.GetDispatchLoader()}; + scheduler.Record([src_image = src_view->GetImage(), dst_image = dst_view->GetImage(), blit, + is_linear](auto cmdbuf, auto& dld) { + cmdbuf.blitImage(src_image, vk::ImageLayout::eTransferSrcOptimal, dst_image, + vk::ImageLayout::eTransferDstOptimal, {blit}, + is_linear ? vk::Filter::eLinear : vk::Filter::eNearest, dld); + }); +} + void VKTextureCache::BufferCopy(Surface& src_surface, Surface& dst_surface) { + // Currently unimplemented. PBO copies should be dropped and we should use a render pass to + // convert from color to depth and vice versa.
+ LOG_WARNING(Render_Vulkan, "Unimplemented"); +} + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.h b/src/video_core/renderer_vulkan/vk_texture_cache.h new file mode 100644 index 000000000..d3edbe80c --- /dev/null +++ b/src/video_core/renderer_vulkan/vk_texture_cache.h @@ -0,0 +1,239 @@ +// Copyright 2019 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <memory> +#include <unordered_map> + +#include "common/assert.h" +#include "common/common_types.h" +#include "common/logging/log.h" +#include "common/math_util.h" +#include "video_core/gpu.h" +#include "video_core/rasterizer_cache.h" +#include "video_core/renderer_vulkan/declarations.h" +#include "video_core/renderer_vulkan/vk_image.h" +#include "video_core/renderer_vulkan/vk_memory_manager.h" +#include "video_core/renderer_vulkan/vk_scheduler.h" +#include "video_core/texture_cache/surface_base.h" +#include "video_core/texture_cache/texture_cache.h" +#include "video_core/textures/decoders.h" + +namespace Core { +class System; +} + +namespace VideoCore { +class RasterizerInterface; +} + +namespace Vulkan { + +class RasterizerVulkan; +class VKDevice; +class VKResourceManager; +class VKScheduler; +class VKStagingBufferPool; + +class CachedSurfaceView; +class CachedSurface; + +using Surface = std::shared_ptr<CachedSurface>; +using View = std::shared_ptr<CachedSurfaceView>; +using TextureCacheBase = VideoCommon::TextureCache<Surface, View>; + +using VideoCommon::SurfaceParams; +using VideoCommon::ViewParams; + +class CachedSurface final : public VideoCommon::SurfaceBase<View> { + friend CachedSurfaceView; + +public: + explicit CachedSurface(Core::System& system, const VKDevice& device, + VKResourceManager& resource_manager, VKMemoryManager& memory_manager, + VKScheduler& scheduler, VKStagingBufferPool& staging_pool, + GPUVAddr gpu_addr, const SurfaceParams& params); + ~CachedSurface(); + + void UploadTexture(const std::vector<u8>& staging_buffer) override; + void DownloadTexture(std::vector<u8>& staging_buffer) override; + + void FullTransition(vk::PipelineStageFlags new_stage_mask, vk::AccessFlags new_access, + vk::ImageLayout new_layout) { + image->Transition(0, static_cast<u32>(params.GetNumLayers()), 0, params.num_levels, + new_stage_mask, new_access, new_layout); + } + + void Transition(u32 base_layer, u32 num_layers, u32 base_level, u32 num_levels, + vk::PipelineStageFlags new_stage_mask, vk::AccessFlags new_access, + vk::ImageLayout new_layout) { + image->Transition(base_layer, num_layers, base_level, num_levels, new_stage_mask, + new_access, new_layout); + } + + VKImage& GetImage() { + return *image; + } + + const VKImage& GetImage() const { + return *image; + } + + vk::Image GetImageHandle() const { + return image->GetHandle(); + } + + vk::ImageAspectFlags GetAspectMask() const { + return image->GetAspectMask(); + } + + vk::BufferView GetBufferViewHandle() const { + return *buffer_view; + } + +protected: + void DecorateSurfaceName(); + + View CreateView(const ViewParams& params) override; + View CreateViewInner(const ViewParams& params, bool is_proxy); + +private: + void UploadBuffer(const std::vector<u8>& staging_buffer); + + void UploadImage(const std::vector<u8>& staging_buffer); + + vk::BufferImageCopy GetBufferImageCopy(u32 level) const; + + vk::ImageSubresourceRange GetImageSubresourceRange() const; + + Core::System& system; + const VKDevice& device; + VKResourceManager& resource_manager; + 
+    VKMemoryManager& memory_manager;
+    VKScheduler& scheduler;
+    VKStagingBufferPool& staging_pool;
+
+    std::optional<VKImage> image;
+    UniqueBuffer buffer;
+    UniqueBufferView buffer_view;
+    VKMemoryCommit commit;
+
+    vk::Format format;
+};
+
+class CachedSurfaceView final : public VideoCommon::ViewBase {
+public:
+    explicit CachedSurfaceView(const VKDevice& device, CachedSurface& surface,
+                               const ViewParams& params, bool is_proxy);
+    ~CachedSurfaceView();
+
+    vk::ImageView GetHandle(Tegra::Texture::SwizzleSource x_source,
+                            Tegra::Texture::SwizzleSource y_source,
+                            Tegra::Texture::SwizzleSource z_source,
+                            Tegra::Texture::SwizzleSource w_source);
+
+    bool IsSameSurface(const CachedSurfaceView& rhs) const {
+        return &surface == &rhs.surface;
+    }
+
+    vk::ImageView GetHandle() {
+        return GetHandle(Tegra::Texture::SwizzleSource::R, Tegra::Texture::SwizzleSource::G,
+                         Tegra::Texture::SwizzleSource::B, Tegra::Texture::SwizzleSource::A);
+    }
+
+    u32 GetWidth() const {
+        return params.GetMipWidth(base_level);
+    }
+
+    u32 GetHeight() const {
+        return params.GetMipHeight(base_level);
+    }
+
+    bool IsBufferView() const {
+        return buffer_view;
+    }
+
+    vk::Image GetImage() const {
+        return image;
+    }
+
+    vk::BufferView GetBufferView() const {
+        return buffer_view;
+    }
+
+    vk::ImageSubresourceRange GetImageSubresourceRange() const {
+        return {aspect_mask, base_level, num_levels, base_layer, num_layers};
+    }
+
+    vk::ImageSubresourceLayers GetImageSubresourceLayers() const {
+        return {surface.GetAspectMask(), base_level, base_layer, num_layers};
+    }
+
+    void Transition(vk::ImageLayout new_layout, vk::PipelineStageFlags new_stage_mask,
+                    vk::AccessFlags new_access) const {
+        surface.Transition(base_layer, num_layers, base_level, num_levels, new_stage_mask,
+                           new_access, new_layout);
+    }
+
+    void MarkAsModified(u64 tick) {
+        surface.MarkAsModified(true, tick);
+    }
+
+private:
+    static u32 EncodeSwizzle(Tegra::Texture::SwizzleSource x_source,
+                             Tegra::Texture::SwizzleSource y_source,
+                             Tegra::Texture::SwizzleSource z_source,
+                             Tegra::Texture::SwizzleSource w_source) {
+        return (static_cast<u32>(x_source) << 24) | (static_cast<u32>(y_source) << 16) |
+               (static_cast<u32>(z_source) << 8) | static_cast<u32>(w_source);
+    }
+
+    // Store a copy of these values to avoid double dereference when reading them
+    const SurfaceParams params;
+    const vk::Image image;
+    const vk::BufferView buffer_view;
+    const vk::ImageAspectFlags aspect_mask;
+
+    const VKDevice& device;
+    CachedSurface& surface;
+    const u32 base_layer;
+    const u32 num_layers;
+    const u32 base_level;
+    const u32 num_levels;
+    const vk::ImageViewType image_view_type;
+
+    vk::ImageView last_image_view;
+    u32 last_swizzle{};
+
+    std::unordered_map<u32, UniqueImageView> view_cache;
+};
+
+class VKTextureCache final : public TextureCacheBase {
+public:
+    explicit VKTextureCache(Core::System& system, VideoCore::RasterizerInterface& rasterizer,
+                            const VKDevice& device, VKResourceManager& resource_manager,
+                            VKMemoryManager& memory_manager, VKScheduler& scheduler,
+                            VKStagingBufferPool& staging_pool);
+    ~VKTextureCache();
+
+private:
+    Surface CreateSurface(GPUVAddr gpu_addr, const SurfaceParams& params) override;
+
+    void ImageCopy(Surface& src_surface, Surface& dst_surface,
+                   const VideoCommon::CopyParams& copy_params) override;
+
+    void ImageBlit(View& src_view, View& dst_view,
+                   const Tegra::Engines::Fermi2D::Config& copy_config) override;
+
+    void BufferCopy(Surface& src_surface, Surface& dst_surface) override;
+
+    const VKDevice& device;
+    VKResourceManager& resource_manager;
+    VKMemoryManager& memory_manager;
+    VKScheduler& scheduler;
+    VKStagingBufferPool& staging_pool;
+};
+
+} // namespace Vulkan
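CachedSurfaceView::GetHandle keys view_cache on the four swizzle selectors packed into a single u32, so repeated requests for the same swizzle reuse one VkImageView instead of creating a new one. A self-contained sketch of that packing and cache lookup; the enum values below are illustrative stand-ins for Tegra::Texture::SwizzleSource, and the int payload stands in for UniqueImageView:

    #include <cstdint>
    #include <unordered_map>

    // Illustrative stand-in for Tegra::Texture::SwizzleSource.
    enum class SwizzleSource : std::uint32_t { Zero = 0, R = 2, G = 3, B = 4, A = 5 };

    // Pack four swizzle selectors into one 32-bit cache key, as EncodeSwizzle does.
    constexpr std::uint32_t EncodeSwizzle(SwizzleSource x, SwizzleSource y, SwizzleSource z,
                                          SwizzleSource w) {
        return (static_cast<std::uint32_t>(x) << 24) | (static_cast<std::uint32_t>(y) << 16) |
               (static_cast<std::uint32_t>(z) << 8) | static_cast<std::uint32_t>(w);
    }

    int main() {
        // Asking for the same swizzle twice hits the cache instead of building a new view.
        std::unordered_map<std::uint32_t, int> view_cache;
        const auto key = EncodeSwizzle(SwizzleSource::R, SwizzleSource::G, SwizzleSource::B,
                                       SwizzleSource::A);
        const auto [it, is_new] = view_cache.try_emplace(key, /*view=*/42);
        return is_new ? 0 : 1;
    }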
diff --git a/src/video_core/shader/control_flow.cpp b/src/video_core/shader/control_flow.cpp
index b427ac873..0229733b6 100644
--- a/src/video_core/shader/control_flow.cpp
+++ b/src/video_core/shader/control_flow.cpp
@@ -65,7 +65,7 @@ struct BlockInfo {
 struct CFGRebuildState {
     explicit CFGRebuildState(const ProgramCode& program_code, u32 start, ConstBufferLocker& locker)
-        : program_code{program_code}, start{start}, locker{locker} {}
+        : program_code{program_code}, locker{locker}, start{start} {}
 
     const ProgramCode& program_code;
     ConstBufferLocker& locker;
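The CFGRebuildState change only reorders the constructor's initializer list to match the member declaration order (program_code, locker, start). C++ always initializes members in declaration order regardless of how the list is written, so a mismatched list is misleading and trips -Wreorder. A minimal reproduction of the hazard the reordering avoids:

    // Members initialize in declaration order, not in initializer-list order.
    struct Example {
        int first;  // declared first, so initialized first...
        int second;

        // ...even though this list names `second` before `first`. With -Wreorder
        // (enabled by -Wall on GCC/Clang) the mismatch produces a warning, which is
        // what the reordered CFGRebuildState initializer list silences.
        Example(int a, int b) : second{b}, first{a} {}
    };

    int main() {
        return Example{1, 2}.first == 1 ? 0 : 1;
    }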
diff --git a/src/video_core/shader/decode/memory.cpp b/src/video_core/shader/decode/memory.cpp
index c934d0719..7591a715f 100644
--- a/src/video_core/shader/decode/memory.cpp
+++ b/src/video_core/shader/decode/memory.cpp
@@ -6,6 +6,7 @@
 #include <vector>
 
 #include <fmt/format.h>
 
+#include "common/alignment.h"
 #include "common/assert.h"
 #include "common/common_types.h"
 #include "common/logging/log.h"
@@ -15,6 +16,8 @@
 
 namespace VideoCommon::Shader {
 
+using Tegra::Shader::AtomicOp;
+using Tegra::Shader::AtomicType;
 using Tegra::Shader::Attribute;
 using Tegra::Shader::Instruction;
 using Tegra::Shader::OpCode;
@@ -22,34 +25,39 @@ using Tegra::Shader::Register;
 
 namespace {
 
-u32 GetLdgMemorySize(Tegra::Shader::UniformType uniform_type) {
+bool IsUnaligned(Tegra::Shader::UniformType uniform_type) {
+    return uniform_type == Tegra::Shader::UniformType::UnsignedByte ||
+           uniform_type == Tegra::Shader::UniformType::UnsignedShort;
+}
+
+u32 GetUnalignedMask(Tegra::Shader::UniformType uniform_type) {
     switch (uniform_type) {
     case Tegra::Shader::UniformType::UnsignedByte:
-    case Tegra::Shader::UniformType::Single:
-        return 1;
-    case Tegra::Shader::UniformType::Double:
-        return 2;
-    case Tegra::Shader::UniformType::Quad:
-    case Tegra::Shader::UniformType::UnsignedQuad:
-        return 4;
+        return 0b11;
+    case Tegra::Shader::UniformType::UnsignedShort:
+        return 0b10;
     default:
-        UNIMPLEMENTED_MSG("Unimplemented size={}!", static_cast<u32>(uniform_type));
-        return 1;
+        UNREACHABLE();
+        return 0;
     }
 }
 
-u32 GetStgMemorySize(Tegra::Shader::UniformType uniform_type) {
+u32 GetMemorySize(Tegra::Shader::UniformType uniform_type) {
     switch (uniform_type) {
+    case Tegra::Shader::UniformType::UnsignedByte:
+        return 8;
+    case Tegra::Shader::UniformType::UnsignedShort:
+        return 16;
     case Tegra::Shader::UniformType::Single:
-        return 1;
+        return 32;
     case Tegra::Shader::UniformType::Double:
-        return 2;
+        return 64;
     case Tegra::Shader::UniformType::Quad:
     case Tegra::Shader::UniformType::UnsignedQuad:
-        return 4;
+        return 128;
     default:
         UNIMPLEMENTED_MSG("Unimplemented size={}!", static_cast<u32>(uniform_type));
-        return 1;
+        return 32;
     }
 }
@@ -184,9 +192,10 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
         }();
 
         const auto [real_address_base, base_address, descriptor] =
-            TrackGlobalMemory(bb, instr, false);
+            TrackGlobalMemory(bb, instr, true, false);
 
-        const u32 count = GetLdgMemorySize(type);
+        const u32 size = GetMemorySize(type);
+        const u32 count = Common::AlignUp(size, 32) / 32;
 
         if (!real_address_base || !base_address) {
             // Tracking failed, load zeroes.
             for (u32 i = 0; i < count; ++i) {
@@ -200,14 +209,15 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
             const Node real_address = Operation(OperationCode::UAdd, real_address_base, it_offset);
             Node gmem = MakeNode<GmemNode>(real_address, base_address, descriptor);
 
-            if (type == Tegra::Shader::UniformType::UnsignedByte) {
-                // To handle unaligned loads get the byte used to dereferenced global memory
-                // and extract that byte from the loaded uint32.
-                Node byte = Operation(OperationCode::UBitwiseAnd, real_address, Immediate(3));
-                byte = Operation(OperationCode::ULogicalShiftLeft, std::move(byte), Immediate(3));
+            // To handle unaligned loads get the bytes used to dereference global memory and extract
+            // those bytes from the loaded u32.
+            if (IsUnaligned(type)) {
+                Node mask = Immediate(GetUnalignedMask(type));
+                Node offset = Operation(OperationCode::UBitwiseAnd, real_address, std::move(mask));
+                offset = Operation(OperationCode::ULogicalShiftLeft, offset, Immediate(3));
 
-                gmem = Operation(OperationCode::UBitfieldExtract, std::move(gmem), std::move(byte),
-                                 Immediate(8));
+                gmem = Operation(OperationCode::UBitfieldExtract, std::move(gmem),
+                                 std::move(offset), Immediate(size));
             }
 
             SetTemporary(bb, i, gmem);
@@ -295,23 +305,53 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
             }
         }();
 
+        // For unaligned reads we have to read memory too.
+        const bool is_read = IsUnaligned(type);
        const auto [real_address_base, base_address, descriptor] =
-            TrackGlobalMemory(bb, instr, true);
+            TrackGlobalMemory(bb, instr, is_read, true);
 
         if (!real_address_base || !base_address) {
             // Tracking failed, skip the store.
             break;
         }
 
-        const u32 count = GetStgMemorySize(type);
+        const u32 size = GetMemorySize(type);
+        const u32 count = Common::AlignUp(size, 32) / 32;
         for (u32 i = 0; i < count; ++i) {
             const Node it_offset = Immediate(i * 4);
             const Node real_address = Operation(OperationCode::UAdd, real_address_base, it_offset);
             const Node gmem = MakeNode<GmemNode>(real_address, base_address, descriptor);
-            const Node value = GetRegister(instr.gpr0.Value() + i);
+            Node value = GetRegister(instr.gpr0.Value() + i);
+
+            if (IsUnaligned(type)) {
+                Node mask = Immediate(GetUnalignedMask(type));
+                Node offset = Operation(OperationCode::UBitwiseAnd, real_address, std::move(mask));
+                offset = Operation(OperationCode::ULogicalShiftLeft, offset, Immediate(3));
+
+                value = Operation(OperationCode::UBitfieldInsert, gmem, std::move(value), offset,
+                                  Immediate(size));
+            }
+
             bb.push_back(Operation(OperationCode::Assign, gmem, value));
         }
         break;
     }
+    case OpCode::Id::ATOMS: {
+        UNIMPLEMENTED_IF_MSG(instr.atoms.operation != AtomicOp::Add, "operation={}",
+                             static_cast<int>(instr.atoms.operation.Value()));
+        UNIMPLEMENTED_IF_MSG(instr.atoms.type != AtomicType::U32, "type={}",
+                             static_cast<int>(instr.atoms.type.Value()));
+
+        const s32 offset = instr.atoms.GetImmediateOffset();
+        Node address = GetRegister(instr.gpr8);
+        address = Operation(OperationCode::IAdd, std::move(address), Immediate(offset));
+
+        Node memory = GetSharedMemory(std::move(address));
+        Node data = GetRegister(instr.gpr20);
+
+        Node value = Operation(OperationCode::UAtomicAdd, std::move(memory), std::move(data));
+        SetRegister(bb, instr.gpr0, std::move(value));
+        break;
+    }
     case OpCode::Id::AL2P: {
         // Ignore al2p.direction since we don't care about it.
@@ -336,7 +376,7 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
 
 std::tuple<Node, Node, GlobalMemoryBase> ShaderIR::TrackGlobalMemory(NodeBlock& bb,
                                                                      Instruction instr,
-                                                                     bool is_write) {
+                                                                     bool is_read, bool is_write) {
     const auto addr_register{GetRegister(instr.gmem.gpr)};
     const auto immediate_offset{static_cast<u32>(instr.gmem.offset)};
@@ -351,11 +391,8 @@ std::tuple<Node, Node, GlobalMemoryBase> ShaderIR::TrackGlobalMemory(NodeBlock&
     const GlobalMemoryBase descriptor{index, offset};
     const auto& [entry, is_new] = used_global_memory.try_emplace(descriptor);
     auto& usage = entry->second;
-    if (is_write) {
-        usage.is_written = true;
-    } else {
-        usage.is_read = true;
-    }
+    usage.is_written |= is_write;
+    usage.is_read |= is_read;
 
     const auto real_address =
         Operation(OperationCode::UAdd, NO_PRECISE, Immediate(immediate_offset), addr_register);
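With GetMemorySize now returning bits, an access of width size touches Common::AlignUp(size, 32) / 32 words (so a 32-bit LDG still reads one word), while U8 and U16 accesses read one word and extract or splice the addressed bits; this is also why sub-word stores now mark the region as read, since they are read-modify-write. A plain C++ sketch of the same mask/shift arithmetic the emitted IR performs, as a host-side illustration rather than the Node machinery:

    #include <cstdint>

    // Mirror of GetUnalignedMask: U8 -> 0b11 (byte within the word), U16 -> 0b10.
    // `size` is the access width in bits, as returned by GetMemorySize.
    std::uint32_t ExtractUnaligned(std::uint32_t loaded_word, std::uint32_t address,
                                   std::uint32_t unaligned_mask, std::uint32_t size) {
        // Byte offset inside the aligned word, converted to bits (<< 3 == * 8).
        const std::uint32_t bit_offset = (address & unaligned_mask) << 3;
        const std::uint32_t mask = (size == 32) ? ~0U : ((1U << size) - 1);
        return (loaded_word >> bit_offset) & mask; // UBitfieldExtract equivalent
    }

    // Store counterpart: splice `value` into the word, as UBitfieldInsert does for STG.
    std::uint32_t InsertUnaligned(std::uint32_t old_word, std::uint32_t value,
                                  std::uint32_t address, std::uint32_t unaligned_mask,
                                  std::uint32_t size) {
        const std::uint32_t bit_offset = (address & unaligned_mask) << 3;
        const std::uint32_t mask = ((size == 32) ? ~0U : ((1U << size) - 1)) << bit_offset;
        return (old_word & ~mask) | ((value << bit_offset) & mask);
    }

    int main() {
        // Load the byte at address 0x1002 out of the word stored at 0x1000.
        const std::uint32_t word = 0xAABBCCDD; // little-endian bytes DD CC BB AA
        return ExtractUnaligned(word, 0x1002, 0b11, 8) == 0xBB ? 0 : 1;
    }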
diff --git a/src/video_core/shader/decode/texture.cpp b/src/video_core/shader/decode/texture.cpp
index 4b14cdf58..cd984f763 100644
--- a/src/video_core/shader/decode/texture.cpp
+++ b/src/video_core/shader/decode/texture.cpp
@@ -794,14 +794,10 @@ std::tuple<std::size_t, std::size_t> ShaderIR::ValidateAndGetCoordinateElement(
 
 std::vector<Node> ShaderIR::GetAoffiCoordinates(Node aoffi_reg, std::size_t coord_count,
                                                 bool is_tld4) {
-    const auto [coord_offsets, size, wrap_value,
-                diff_value] = [is_tld4]() -> std::tuple<std::array<u32, 3>, u32, s32, s32> {
-        if (is_tld4) {
-            return {{0, 8, 16}, 6, 32, 64};
-        } else {
-            return {{0, 4, 8}, 4, 8, 16};
-        }
-    }();
+    const std::array coord_offsets = is_tld4 ? std::array{0U, 8U, 16U} : std::array{0U, 4U, 8U};
+    const u32 size = is_tld4 ? 6 : 4;
+    const s32 wrap_value = is_tld4 ? 32 : 8;
+    const s32 diff_value = is_tld4 ? 64 : 16;
     const u32 mask = (1U << size) - 1;
 
     std::vector<Node> aoffi;
@@ -814,7 +810,7 @@ std::vector<Node> ShaderIR::GetAoffiCoordinates(Node aoffi_reg, std::size_t coor
         LOG_WARNING(HW_GPU,
                     "AOFFI constant folding failed, some hardware might have graphical issues");
         for (std::size_t coord = 0; coord < coord_count; ++coord) {
-            const Node value = BitfieldExtract(aoffi_reg, coord_offsets.at(coord), size);
+            const Node value = BitfieldExtract(aoffi_reg, coord_offsets[coord], size);
             const Node condition =
                 Operation(OperationCode::LogicalIGreaterEqual, value, Immediate(wrap_value));
             const Node negative = Operation(OperationCode::IAdd, value, Immediate(-diff_value));
@@ -824,7 +820,7 @@ std::vector<Node> ShaderIR::GetAoffiCoordinates(Node aoffi_reg, std::size_t coor
     }
 
     for (std::size_t coord = 0; coord < coord_count; ++coord) {
-        s32 value = (*aoffi_immediate >> coord_offsets.at(coord)) & mask;
+        s32 value = (*aoffi_immediate >> coord_offsets[coord]) & mask;
         if (value >= wrap_value) {
             value -= diff_value;
         }
diff --git a/src/video_core/shader/node.h b/src/video_core/shader/node.h
index 4e155542a..075c7d07c 100644
--- a/src/video_core/shader/node.h
+++ b/src/video_core/shader/node.h
@@ -162,6 +162,8 @@ enum class OperationCode {
     AtomicImageXor,      /// (MetaImage, int[N] coords) -> void
     AtomicImageExchange, /// (MetaImage, int[N] coords) -> void
 
+    UAtomicAdd, /// (smem, uint) -> uint
+
     Branch,         /// (uint branch_target) -> void
     BranchIndirect, /// (uint branch_target) -> void
     PushFlowStack,  /// (uint branch_target) -> void
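The GetAoffiCoordinates rewrite replaces the tuple-returning lambda with plain ternaries; the decoding itself is unchanged. Packed AOFFI fields are two's-complement: a raw value of wrap_value or more is really negative and gets diff_value (2^size) subtracted. A worked immediate-mode example using the non-TLD4 parameters (4-bit fields at bit offsets 0/4/8), as a standalone sketch of the constant-folded path:

    #include <array>
    #include <cstddef>
    #include <cstdint>

    // Decode packed AOFFI texture offsets from an immediate. Non-TLD4 parameters:
    // 4-bit fields, wrap at 8, bias of 16 to sign-extend.
    std::array<std::int32_t, 2> DecodeAoffi2D(std::uint32_t imm) {
        constexpr std::array<std::uint32_t, 3> coord_offsets{0, 4, 8};
        constexpr std::uint32_t size = 4;
        constexpr std::int32_t wrap_value = 8;  // first raw value that is negative
        constexpr std::int32_t diff_value = 16; // 2^size, subtracted to sign-extend
        const std::uint32_t mask = (1U << size) - 1;

        std::array<std::int32_t, 2> out{};
        for (std::size_t coord = 0; coord < out.size(); ++coord) {
            auto value = static_cast<std::int32_t>((imm >> coord_offsets[coord]) & mask);
            if (value >= wrap_value) {
                value -= diff_value; // e.g. raw 0xF decodes to -1
            }
            out[coord] = value;
        }
        return out;
    }

    int main() {
        // Packed offsets (x = -1, y = +2): 0xF in bits 0..3, 0x2 in bits 4..7.
        const auto [x, y] = DecodeAoffi2D(0x2F);
        return (x == -1 && y == 2) ? 0 : 1;
    }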
diff --git a/src/video_core/shader/shader_ir.h b/src/video_core/shader/shader_ir.h
index aacd0a0da..ba1db4c11 100644
--- a/src/video_core/shader/shader_ir.h
+++ b/src/video_core/shader/shader_ir.h
@@ -394,7 +394,7 @@ private:
     std::tuple<Node, Node, GlobalMemoryBase> TrackGlobalMemory(NodeBlock& bb,
                                                                Tegra::Shader::Instruction instr,
-                                                               bool is_write);
+                                                               bool is_read, bool is_write);
 
     /// Register new amending code and obtain the reference id.
     std::size_t DeclareAmend(Node new_amend);
diff --git a/src/video_core/texture_cache/format_lookup_table.cpp b/src/video_core/texture_cache/format_lookup_table.cpp
index 271e67533..81fb9f633 100644
--- a/src/video_core/texture_cache/format_lookup_table.cpp
+++ b/src/video_core/texture_cache/format_lookup_table.cpp
@@ -95,7 +95,7 @@ constexpr std::array<Table, 74> DefinitionTable = {{
     {TextureFormat::ZF32, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::Z32F},
     {TextureFormat::Z16, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::Z16},
     {TextureFormat::S8Z24, C, UINT, UNORM, UNORM, UNORM, PixelFormat::S8Z24},
-    {TextureFormat::ZF32_X24S8, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::Z32FS8},
+    {TextureFormat::ZF32_X24S8, C, FLOAT, UINT, UNORM, UNORM, PixelFormat::Z32FS8},
 
     {TextureFormat::DXT1, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::DXT1},
     {TextureFormat::DXT1, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::DXT1_SRGB},
diff --git a/src/video_core/texture_cache/surface_params.h b/src/video_core/texture_cache/surface_params.h
index 992b5c022..9256fd6d9 100644
--- a/src/video_core/texture_cache/surface_params.h
+++ b/src/video_core/texture_cache/surface_params.h
@@ -209,6 +209,11 @@ public:
         return target == VideoCore::Surface::SurfaceTarget::TextureBuffer;
     }
 
+    /// Returns the number of layers in the surface.
+    std::size_t GetNumLayers() const {
+        return is_layered ? depth : 1;
+    }
+
     /// Returns the debug name of the texture for use in graphic debuggers.
     std::string TargetName() const;
 
@@ -287,10 +292,6 @@ private:
     /// Returns the size of a layer
     std::size_t GetLayerSize(bool as_host_size, bool uncompressed) const;
 
-    std::size_t GetNumLayers() const {
-        return is_layered ? depth : 1;
-    }
-
     /// Returns true if these parameters are from a layered surface.
     bool IsLayered() const;
 };
diff --git a/src/yuzu/configuration/configure_hotkeys.cpp b/src/yuzu/configuration/configure_hotkeys.cpp
index 3ea0b8d67..fa9052136 100644
--- a/src/yuzu/configuration/configure_hotkeys.cpp
+++ b/src/yuzu/configuration/configure_hotkeys.cpp
@@ -48,6 +48,7 @@ void ConfigureHotkeys::Populate(const HotkeyRegistry& registry) {
     }
 
     ui->hotkey_list->expandAll();
+    ui->hotkey_list->resizeColumnToContents(0);
 }
 
 void ConfigureHotkeys::changeEvent(QEvent* event) {
diff --git a/src/yuzu/main.ui b/src/yuzu/main.ui
index 581a10ddc..a2c9e4547 100644
--- a/src/yuzu/main.ui
+++ b/src/yuzu/main.ui
@@ -15,7 +15,7 @@
   </property>
   <property name="windowIcon">
    <iconset>
-    <normaloff>src/pcafe/res/icon3_64x64.ico</normaloff>src/pcafe/res/icon3_64x64.ico</iconset>
+    <normaloff>../dist/yuzu.ico</normaloff>../dist/yuzu.ico</iconset>
   </property>
   <property name="tabShape">
    <enum>QTabWidget::Rounded</enum>
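The format_lookup_table.cpp fix above reflects that ZF32_X24S8 is a mixed-type depth-stencil format: the depth component samples as a 32-bit float and the stencil component as an unsigned integer, so tagging every component UNORM mistyped both. A host-side sketch of reinterpreting one such texel; the exact placement of the stencil byte within the second word is assumed here for illustration:

    #include <cstdint>
    #include <cstring>

    // One ZF32_X24S8 texel: a 32-bit float depth word, then an 8-bit stencil value
    // padded to 32 bits (the X24 part is unused).
    struct Z32FS8Texel {
        float depth;         // component 0: FLOAT, not UNORM
        std::uint32_t x24s8; // assumed: low 8 bits hold the stencil as UINT
    };

    int main() {
        // Reinterpret 8 raw bytes as one texel, the way a sampler with the corrected
        // component types (FLOAT, UINT) conceptually would.
        const std::uint8_t raw[8] = {0x00, 0x00, 0x80, 0x3F,  // 1.0f, little-endian
                                     0x7F, 0x00, 0x00, 0x00}; // stencil = 0x7F
        Z32FS8Texel texel;
        std::memcpy(&texel, raw, sizeof(texel));
        const std::uint32_t stencil = texel.x24s8 & 0xFF;
        return (texel.depth == 1.0f && stencil == 0x7F) ? 0 : 1;
    }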