7 files changed, 67 insertions, 43 deletions
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index de971041f..9e6b87960 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -596,7 +596,7 @@ void BufferCache<P>::PopAsyncFlushes() {
             runtime.CopyBuffer(download_staging.buffer, slot_buffers[buffer_id], copies);
         }
         runtime.Finish();
-        for (const auto [copy, buffer_id] : downloads) {
+        for (const auto& [copy, buffer_id] : downloads) {
             const Buffer& buffer = slot_buffers[buffer_id];
             const VAddr cpu_addr = buffer.CpuAddr() + copy.src_offset;
             // Undo the modified offset
@@ -606,7 +606,7 @@ void BufferCache<P>::PopAsyncFlushes() {
         }
     } else {
         const std::span<u8> immediate_buffer = ImmediateBuffer(largest_copy);
-        for (const auto [copy, buffer_id] : downloads) {
+        for (const auto& [copy, buffer_id] : downloads) {
             Buffer& buffer = slot_buffers[buffer_id];
             buffer.ImmediateDownload(copy.src_offset, immediate_buffer.subspan(0, copy.size));
             const VAddr cpu_addr = buffer.CpuAddr() + copy.src_offset;
diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp
index 37f7b24e1..35cc561be 100644
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -104,7 +104,13 @@ void GPU::WaitFence(u32 syncpoint_id, u32 value) {
     }
     MICROPROFILE_SCOPE(GPU_wait);
     std::unique_lock lock{sync_mutex};
-    sync_cv.wait(lock, [=, this] { return syncpoints.at(syncpoint_id).load() >= value; });
+    sync_cv.wait(lock, [=, this] {
+        if (shutting_down.load(std::memory_order_relaxed)) {
+            // We're shutting down, ensure no threads continue to wait for the next syncpoint
+            return true;
+        }
+        return syncpoints.at(syncpoint_id).load() >= value;
+    });
 }
 
 void GPU::IncrementSyncPoint(const u32 syncpoint_id) {
@@ -523,6 +529,15 @@ void GPU::TriggerCpuInterrupt(const u32 syncpoint_id, const u32 value) const {
 }
 
 void GPU::ShutDown() {
+    // Signal that threads should no longer block on syncpoint fences.
+    // The flag is published while holding sync_mutex: WaitFence checks it under that mutex, so a
+    // waiter that has just evaluated its predicate cannot miss the notification below.
+    {
+        std::scoped_lock lock{sync_mutex};
+        shutting_down.store(true, std::memory_order_relaxed);
+    }
+    sync_cv.notify_all();
+
     gpu_thread.ShutDown();
 }
 
diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h
index 29a867863..a8e98e51b 100644
--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@@ -389,6 +389,8 @@ private:
     std::unique_ptr<Engines::KeplerMemory> kepler_memory;
     /// Shader build notifier
     std::unique_ptr<VideoCore::ShaderNotify> shader_notify;
+    /// When true, we are about to shut down the emulation session, so terminate outstanding tasks
+    std::atomic_bool shutting_down{};
 
     std::array<std::atomic<u32>, Service::Nvidia::MaxSyncPoints> syncpoints{};
 
diff --git a/src/video_core/rasterizer_accelerated.cpp b/src/video_core/rasterizer_accelerated.cpp
index 62d84c0f8..6decd2546 100644
--- a/src/video_core/rasterizer_accelerated.cpp
+++ b/src/video_core/rasterizer_accelerated.cpp
@@ -18,10 +18,10 @@ RasterizerAccelerated::~RasterizerAccelerated() = default;
 void RasterizerAccelerated::UpdatePagesCachedCount(VAddr addr, u64 size, int delta) {
     const auto page_end = Common::DivCeil(addr + size, Core::Memory::PAGE_SIZE);
     for (auto page = addr >> Core::Memory::PAGE_BITS; page != page_end; ++page) {
-        auto& count = cached_pages.at(page >> 3).Count(page);
+        auto& count = cached_pages.at(page >> 2).Count(page);
 
         if (delta > 0) {
-            ASSERT_MSG(count < UINT8_MAX, "Count may overflow!");
+            ASSERT_MSG(count < UINT16_MAX, "Count may overflow!");
         } else if (delta < 0) {
             ASSERT_MSG(count > 0, "Count may underflow!");
         } else {
@@ -29,7 +29,7 @@ void RasterizerAccelerated::UpdatePagesCachedCount(VAddr addr, u64 size, int del
         }
 
-        // Adds or subtracts 1, as count is a unsigned 8-bit value
+        // Adds or subtracts 1, as count is an unsigned 16-bit value
-        count += static_cast<u8>(delta);
+        count += static_cast<u16>(delta);
 
         // Assume delta is either -1 or 1
         if (count == 0) {
diff --git a/src/video_core/rasterizer_accelerated.h b/src/video_core/rasterizer_accelerated.h
index 9227a4adc..ea879bfdd 100644
--- a/src/video_core/rasterizer_accelerated.h
+++ b/src/video_core/rasterizer_accelerated.h
@@ -29,20 +29,20 @@ private:
     public:
         CacheEntry() = default;
 
-        std::atomic_uint8_t& Count(std::size_t page) {
-            return values[page & 7];
+        std::atomic_uint16_t& Count(std::size_t page) {
+            return values[page & 3];
         }
 
-        const std::atomic_uint8_t& Count(std::size_t page) const {
-            return values[page & 7];
+        const std::atomic_uint16_t& Count(std::size_t page) const {
+            return values[page & 3];
         }
 
     private:
-        std::array<std::atomic_uint8_t, 8> values{};
+        std::array<std::atomic_uint16_t, 4> values{};
     };
     static_assert(sizeof(CacheEntry) == 8, "CacheEntry should be 8 bytes!");
 
-    std::array<CacheEntry, 0x800000> cached_pages;
+    std::array<CacheEntry, 0x1000000> cached_pages;
     Core::Memory::Memory& cpu_memory;
 };
 
diff --git a/src/video_core/vulkan_common/vulkan_memory_allocator.cpp b/src/video_core/vulkan_common/vulkan_memory_allocator.cpp
index fa37aa79a..5edd06ebc 100644
--- a/src/video_core/vulkan_common/vulkan_memory_allocator.cpp
+++ b/src/video_core/vulkan_common/vulkan_memory_allocator.cpp
@@ -53,6 +53,18 @@ struct Range {
     UNREACHABLE_MSG("Invalid memory usage={}", usage);
     return VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
 }
+
+constexpr VkExportMemoryAllocateInfo EXPORT_ALLOCATE_INFO{
+    .sType = VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO,
+    .pNext = nullptr,
+#ifdef _WIN32
+    .handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT,
+#elif __unix__
+    .handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT,
+#else
+    .handleTypes = 0,
+#endif
+};
 } // Anonymous namespace
 
 class MemoryAllocation {
@@ -131,7 +143,7 @@ public:
 
     /// Returns whether this allocation is compatible with the arguments.
     [[nodiscard]] bool IsCompatible(VkMemoryPropertyFlags flags, u32 type_mask) const {
-        return (flags & property_flags) && (type_mask & shifted_memory_type) != 0;
+        return (flags & property_flags) == property_flags && (type_mask & shifted_memory_type) != 0;
     }
 
 private:
@@ -217,14 +229,18 @@ MemoryAllocator::~MemoryAllocator() = default;
 
 MemoryCommit MemoryAllocator::Commit(const VkMemoryRequirements& requirements, MemoryUsage usage) {
     // Find the fastest memory flags we can afford with the current requirements
-    const VkMemoryPropertyFlags flags = MemoryPropertyFlags(requirements.memoryTypeBits, usage);
+    const u32 type_mask = requirements.memoryTypeBits;
+    const VkMemoryPropertyFlags usage_flags = MemoryUsagePropertyFlags(usage);
+    const VkMemoryPropertyFlags flags = MemoryPropertyFlags(type_mask, usage_flags);
     if (std::optional<MemoryCommit> commit = TryCommit(requirements, flags)) {
         return std::move(*commit);
     }
     // Commit has failed, allocate more memory.
-    // TODO(Rodrigo): Handle out of memory situations in some way like flushing to guest memory.
-    AllocMemory(flags, requirements.memoryTypeBits, AllocationChunkSize(requirements.size));
-
+    const u64 chunk_size = AllocationChunkSize(requirements.size);
+    if (!TryAllocMemory(flags, type_mask, chunk_size)) {
+        // TODO(Rodrigo): Handle out of memory situations in some way like flushing to guest memory.
+        throw vk::Exception(VK_ERROR_OUT_OF_DEVICE_MEMORY);
+    }
     // Commit again, this time it won't fail since there's a fresh allocation above.
     // If it does, there's a bug.
     return TryCommit(requirements, flags).value();
@@ -242,26 +258,25 @@ MemoryCommit MemoryAllocator::Commit(const vk::Image& image, MemoryUsage usage)
     return commit;
 }
 
-void MemoryAllocator::AllocMemory(VkMemoryPropertyFlags flags, u32 type_mask, u64 size) {
+bool MemoryAllocator::TryAllocMemory(VkMemoryPropertyFlags flags, u32 type_mask, u64 size) {
     const u32 type = FindType(flags, type_mask).value();
-    const VkExportMemoryAllocateInfo export_allocate_info{
-        .sType = VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO,
-        .pNext = nullptr,
-#ifdef _WIN32
-        .handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT,
-#elif __unix__
-        .handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT,
-#else
-        .handleTypes = 0,
-#endif
-    };
-    vk::DeviceMemory memory = device.GetLogical().AllocateMemory({
+    vk::DeviceMemory memory = device.GetLogical().TryAllocateMemory({
         .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
-        .pNext = export_allocations ? &export_allocate_info : nullptr,
+        .pNext = export_allocations ? &EXPORT_ALLOCATE_INFO : nullptr,
         .allocationSize = size,
         .memoryTypeIndex = type,
     });
+    if (!memory) {
+        if ((flags & VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT) != 0) {
+            // Allocation failed: retry once without requiring device local memory
+            return TryAllocMemory(flags & ~VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, type_mask, size);
+        } else {
+            // Allocation failed and there is no device local bit left to drop; report failure
+            return false;
+        }
+    }
     allocations.push_back(std::make_unique<MemoryAllocation>(std::move(memory), flags, size, type));
+    return true;
 }
 
 std::optional<MemoryCommit> MemoryAllocator::TryCommit(const VkMemoryRequirements& requirements,
@@ -274,24 +289,24 @@ std::optional<MemoryCommit> MemoryAllocator::TryCommit(const VkMemoryRequirement
             return commit;
         }
     }
+    if ((flags & VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT) != 0) {
+        // Look for non device local commits on failure
+        return TryCommit(requirements, flags & ~VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
+    }
     return std::nullopt;
 }
 
-VkMemoryPropertyFlags MemoryAllocator::MemoryPropertyFlags(u32 type_mask, MemoryUsage usage) const {
-    return MemoryPropertyFlags(type_mask, MemoryUsagePropertyFlags(usage));
-}
-
 VkMemoryPropertyFlags MemoryAllocator::MemoryPropertyFlags(u32 type_mask,
                                                            VkMemoryPropertyFlags flags) const {
     if (FindType(flags, type_mask)) {
         // Found a memory type with those requirements
         return flags;
     }
-    if (flags & VK_MEMORY_PROPERTY_HOST_CACHED_BIT) {
+    if ((flags & VK_MEMORY_PROPERTY_HOST_CACHED_BIT) != 0) {
         // Remove host cached bit in case it's not supported
         return MemoryPropertyFlags(type_mask, flags & ~VK_MEMORY_PROPERTY_HOST_CACHED_BIT);
     }
-    if (flags & VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT) {
+    if ((flags & VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT) != 0) {
         // Remove device local, if it's not supported by the requested resource
         return MemoryPropertyFlags(type_mask, flags & ~VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
     }
@@ -302,7 +317,7 @@ VkMemoryPropertyFlags MemoryAllocator::MemoryPropertyFlags(u32 type_mask,
 std::optional<u32> MemoryAllocator::FindType(VkMemoryPropertyFlags flags, u32 type_mask) const {
     for (u32 type_index = 0; type_index < properties.memoryTypeCount; ++type_index) {
         const VkMemoryPropertyFlags type_flags = properties.memoryTypes[type_index].propertyFlags;
-        if ((type_mask & (1U << type_index)) && (type_flags & flags)) {
+        if ((type_mask & (1U << type_index)) != 0 && (type_flags & flags) == flags) {
             // The type matches in type and in the wanted properties.
             return type_index;
         }
diff --git a/src/video_core/vulkan_common/vulkan_memory_allocator.h b/src/video_core/vulkan_common/vulkan_memory_allocator.h
index d1ce29450..db12d02f4 100644
--- a/src/video_core/vulkan_common/vulkan_memory_allocator.h
+++ b/src/video_core/vulkan_common/vulkan_memory_allocator.h
@@ -101,16 +101,13 @@ public:
     MemoryCommit Commit(const vk::Image& image, MemoryUsage usage);
 
 private:
-    /// Allocates a chunk of memory.
-    void AllocMemory(VkMemoryPropertyFlags flags, u32 type_mask, u64 size);
+    /// Tries to allocate a chunk of memory.
+    bool TryAllocMemory(VkMemoryPropertyFlags flags, u32 type_mask, u64 size);
 
     /// Tries to allocate a memory commit.
     std::optional<MemoryCommit> TryCommit(const VkMemoryRequirements& requirements,
                                           VkMemoryPropertyFlags flags);
 
-    /// Returns the fastest compatible memory property flags from a wanted usage.
-    VkMemoryPropertyFlags MemoryPropertyFlags(u32 type_mask, MemoryUsage usage) const;
-
     /// Returns the fastest compatible memory property flags from the wanted flags.
     VkMemoryPropertyFlags MemoryPropertyFlags(u32 type_mask, VkMemoryPropertyFlags flags) const;