1 files changed, 58 insertions, 13 deletions
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index fa26eb8b0..3f2bf6294 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -11,7 +11,6 @@
 #include <mutex>
 #include <numeric>
 #include <span>
-#include <unordered_map>
 #include <vector>
 
 #include <boost/container/small_vector.hpp>
@@ -22,7 +21,6 @@
 #include "common/literals.h"
 #include "common/lru_cache.h"
 #include "common/microprofile.h"
-#include "common/scope_exit.h"
 #include "common/settings.h"
 #include "core/memory.h"
 #include "video_core/buffer_cache/buffer_base.h"
@@ -78,8 +76,9 @@ class BufferCache {
 
     static constexpr BufferId NULL_BUFFER_ID{0};
 
-    static constexpr u64 EXPECTED_MEMORY = 512_MiB;
-    static constexpr u64 CRITICAL_MEMORY = 1_GiB;
+    static constexpr s64 DEFAULT_EXPECTED_MEMORY = 512_MiB; // Fallback/floor for minimum_memory
+    static constexpr s64 DEFAULT_CRITICAL_MEMORY = 1_GiB; // Fallback/floor for critical_memory
+    static constexpr s64 TARGET_THRESHOLD = 4_GiB; // Cap on device_memory for vacancy math
 
     using Maxwell = Tegra::Engines::Maxwell3D::Regs;
 
@@ -438,6 +437,8 @@ private:
     Common::LeastRecentlyUsedCache<LRUItemParams> lru_cache;
     u64 frame_tick = 0;
     u64 total_used_memory = 0;
+    u64 minimum_memory = 0; // total_used_memory level at which TickFrame runs the GC
+    u64 critical_memory = 0; // total_used_memory level at which the GC turns aggressive
 
     std::array<BufferId, ((1ULL << 39) >> PAGE_BITS)> page_table;
 };
@@ -453,11 +454,30 @@ BufferCache<P>::BufferCache(VideoCore::RasterizerInterface& rasterizer_,
     // Ensure the first slot is used for the null buffer
     void(slot_buffers.insert(runtime, NullBufferParams{}));
     common_ranges.clear();
+
+    if (!runtime.CanReportMemoryUsage()) { // No usage query: keep the fixed defaults
+        minimum_memory = DEFAULT_EXPECTED_MEMORY;
+        critical_memory = DEFAULT_CRITICAL_MEMORY;
+        return;
+    }
+
+    const s64 device_memory = static_cast<s64>(runtime.GetDeviceLocalMemory());
+    const s64 min_spacing_expected = device_memory - 1_GiB - 512_MiB; // Spares 1.5 GiB
+    const s64 min_spacing_critical = device_memory - 1_GiB; // Spares 1 GiB
+    const s64 mem_threshold = std::min(device_memory, TARGET_THRESHOLD); // Capped at 4 GiB
+    const s64 min_vacancy_expected = (6 * mem_threshold) / 10; // 60% of mem_threshold
+    const s64 min_vacancy_critical = (3 * mem_threshold) / 10; // 30% of mem_threshold
+    minimum_memory = static_cast<u64>( // Lower of both limits, never below the default
+        std::max(std::min(device_memory - min_vacancy_expected, min_spacing_expected),
+                 DEFAULT_EXPECTED_MEMORY));
+    critical_memory = static_cast<u64>( // Lower of both limits, never below the default
+        std::max(std::min(device_memory - min_vacancy_critical, min_spacing_critical),
+                 DEFAULT_CRITICAL_MEMORY));
 }
 
 template <class P>
 void BufferCache<P>::RunGarbageCollector() {
-    const bool aggressive_gc = total_used_memory >= CRITICAL_MEMORY;
+    const bool aggressive_gc = total_used_memory >= critical_memory;
     const u64 ticks_to_destroy = aggressive_gc ? 60 : 120;
     int num_iterations = aggressive_gc ? 64 : 32;
     const auto clean_up = [this, &num_iterations](BufferId buffer_id) {
@@ -488,7 +508,11 @@ void BufferCache<P>::TickFrame() {
     const bool skip_preferred = hits * 256 < shots * 251;
     uniform_buffer_skip_cache_size = skip_preferred ? DEFAULT_SKIP_CACHE_SIZE : 0;
 
-    if (total_used_memory >= EXPECTED_MEMORY) {
+    // If we can obtain the memory info, use it instead of the estimate.
+    if (runtime.CanReportMemoryUsage()) {
+        total_used_memory = runtime.GetDeviceMemoryUsage();
+    }
+    if (total_used_memory >= minimum_memory) {
         RunGarbageCollector();
     }
     ++frame_tick;
@@ -1287,7 +1311,20 @@ void BufferCache<P>::UpdateVertexBuffer(u32 index) {
     const GPUVAddr gpu_addr_begin = array.StartAddress();
     const GPUVAddr gpu_addr_end = limit.LimitAddress() + 1;
     const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr_begin);
-    const u32 address_size = static_cast<u32>(gpu_addr_end - gpu_addr_begin);
+    u32 address_size = static_cast<u32>(gpu_addr_end - gpu_addr_begin); // Narrowed to u32
+    if (address_size >= 64_MiB) {
+        // Reported vertex buffer size is very large, cap to mapped buffer size
+        GPUVAddr submapped_addr_end = gpu_addr_begin; // Unchanged if no range is returned
+
+        const auto ranges{gpu_memory.GetSubmappedRange(gpu_addr_begin, address_size)};
+        if (ranges.size() > 0) {
+            const auto& [addr, size] = *ranges.begin(); // Only the first range is used
+            submapped_addr_end = addr + size;
+        }
+
+        address_size = // Becomes 0 when no range was returned above
+            std::min(address_size, static_cast<u32>(submapped_addr_end - gpu_addr_begin));
+    }
     const u32 size = address_size; // TODO: Analyze stride and number of vertices
     if (array.enable == 0 || size == 0 || !cpu_addr) {
         vertex_buffers[index] = NULL_BINDING;
@@ -1469,19 +1506,27 @@ typename BufferCache<P>::OverlapResult BufferCache<P>::ResolveOverlaps(VAddr cpu
         overlap_ids.push_back(overlap_id);
         overlap.Pick();
         const VAddr overlap_cpu_addr = overlap.CpuAddr();
-        if (overlap_cpu_addr < begin) {
+        const bool expands_left = overlap_cpu_addr < begin; // Overlap starts before begin
+        if (expands_left) {
             cpu_addr = begin = overlap_cpu_addr;
         }
-        end = std::max(end, overlap_cpu_addr + overlap.SizeBytes());
-
+        const VAddr overlap_end = overlap_cpu_addr + overlap.SizeBytes();
+        const bool expands_right = overlap_end > end; // Captured before end is updated
+        if (overlap_end > end) { // Same condition as expands_right
+            end = overlap_end;
+        }
         stream_score += overlap.StreamScore();
         if (stream_score > STREAM_LEAP_THRESHOLD && !has_stream_leap) {
             // When this memory region has been joined a bunch of times, we assume it's being used
             // as a stream buffer. Increase the size to skip constantly recreating buffers.
             has_stream_leap = true;
-            begin -= PAGE_SIZE * 256;
-            cpu_addr = begin;
-            end += PAGE_SIZE * 256;
+            if (expands_right) { // NOTE(review): grew right, yet pads the left edge - confirm
+                begin -= PAGE_SIZE * 256;
+                cpu_addr = begin;
+            }
+            if (expands_left) { // NOTE(review): grew left, yet pads the right edge - confirm
+                end += PAGE_SIZE * 256;
+            }
         }
     }
     return OverlapResult{