Diffstat:
-rw-r--r--  src/video_core/texture_cache/texture_cache.h | 691
1 file changed, 534 insertions(+), 157 deletions(-)
diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h
index 1b01990a4..d3f03a995 100644
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@@ -1,9 +1,10 @@
-// SPDX-FileCopyrightText: 2021 yuzu Emulator Project
+// SPDX-FileCopyrightText: 2023 yuzu Emulator Project
// SPDX-License-Identifier: GPL-3.0-or-later
#pragma once
#include <unordered_set>
+#include <boost/container/small_vector.hpp>
#include "common/alignment.h"
#include "common/settings.h"
@@ -17,15 +18,10 @@
namespace VideoCommon {
-using Tegra::Texture::SwizzleSource;
-using Tegra::Texture::TextureType;
using Tegra::Texture::TICEntry;
using Tegra::Texture::TSCEntry;
using VideoCore::Surface::GetFormatType;
-using VideoCore::Surface::IsCopyCompatible;
using VideoCore::Surface::PixelFormat;
-using VideoCore::Surface::PixelFormatFromDepthFormat;
-using VideoCore::Surface::PixelFormatFromRenderTargetFormat;
using VideoCore::Surface::SurfaceType;
using namespace Common::Literals;
@@ -53,8 +49,8 @@ TextureCache<P>::TextureCache(Runtime& runtime_, VideoCore::RasterizerInterface&
if constexpr (HAS_DEVICE_MEMORY_INFO) {
const s64 device_memory = static_cast<s64>(runtime.GetDeviceLocalMemory());
- const s64 min_spacing_expected = device_memory - 1_GiB - 512_MiB;
- const s64 min_spacing_critical = device_memory - 1_GiB;
+ const s64 min_spacing_expected = device_memory - 1_GiB;
+ const s64 min_spacing_critical = device_memory - 512_MiB;
const s64 mem_threshold = std::min(device_memory, TARGET_THRESHOLD);
const s64 min_vacancy_expected = (6 * mem_threshold) / 10;
const s64 min_vacancy_critical = (3 * mem_threshold) / 10;
@@ -85,10 +81,17 @@ void TextureCache<P>::RunGarbageCollector() {
}
--num_iterations;
auto& image = slot_images[image_id];
+ if (True(image.flags & ImageFlagBits::IsDecoding)) {
+ // This image is still being decoded; deleting it will invalidate the slot
+ // used by the async decoder thread.
+ return false;
+ }
+ if (!aggressive_mode && True(image.flags & ImageFlagBits::CostlyLoad)) {
+ return false;
+ }
const bool must_download =
image.IsSafeDownload() && False(image.flags & ImageFlagBits::BadOverlap);
- if (!high_priority_mode &&
- (must_download || True(image.flags & ImageFlagBits::CostlyLoad))) {
+ if (!high_priority_mode && must_download) {
return false;
}
if (must_download) {
@@ -133,9 +136,17 @@ void TextureCache<P>::TickFrame() {
sentenced_images.Tick();
sentenced_framebuffers.Tick();
sentenced_image_view.Tick();
+ TickAsyncDecode();
+
runtime.TickFrame();
- critical_gc = 0;
++frame_tick;
+
+ if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) {
+ for (auto& buffer : async_buffers_death_ring) {
+ runtime.FreeDeferredStagingBuffer(buffer);
+ }
+ async_buffers_death_ring.clear();
+ }
}
template <class P>
@@ -174,31 +185,91 @@ void TextureCache<P>::FillComputeImageViews(std::span<ImageViewInOut> views) {
}
template <class P>
+void TextureCache<P>::CheckFeedbackLoop(std::span<const ImageViewInOut> views) {
+ if (!Settings::values.barrier_feedback_loops.GetValue()) {
+ return;
+ }
+
+ const bool requires_barrier = [&] {
+ for (const auto& view : views) {
+ if (!view.id) {
+ continue;
+ }
+ auto& image_view = slot_image_views[view.id];
+
+ // Check color targets
+ for (const auto& ct_view_id : render_targets.color_buffer_ids) {
+ if (ct_view_id) {
+ auto& ct_view = slot_image_views[ct_view_id];
+ if (image_view.image_id == ct_view.image_id) {
+ return true;
+ }
+ }
+ }
+
+ // Check zeta target
+ if (render_targets.depth_buffer_id) {
+ auto& zt_view = slot_image_views[render_targets.depth_buffer_id];
+ if (image_view.image_id == zt_view.image_id) {
+ return true;
+ }
+ }
+ }
+
+ return false;
+ }();
+
+ if (requires_barrier) {
+ runtime.BarrierFeedbackLoop();
+ }
+}
+
+template <class P>
typename P::Sampler* TextureCache<P>::GetGraphicsSampler(u32 index) {
+ return &slot_samplers[GetGraphicsSamplerId(index)];
+}
+
+template <class P>
+typename P::Sampler* TextureCache<P>::GetComputeSampler(u32 index) {
+ return &slot_samplers[GetComputeSamplerId(index)];
+}
+
+template <class P>
+SamplerId TextureCache<P>::GetGraphicsSamplerId(u32 index) {
if (index > channel_state->graphics_sampler_table.Limit()) {
LOG_DEBUG(HW_GPU, "Invalid sampler index={}", index);
- return &slot_samplers[NULL_SAMPLER_ID];
+ return NULL_SAMPLER_ID;
}
const auto [descriptor, is_new] = channel_state->graphics_sampler_table.Read(index);
SamplerId& id = channel_state->graphics_sampler_ids[index];
if (is_new) {
id = FindSampler(descriptor);
}
- return &slot_samplers[id];
+ return id;
}
template <class P>
-typename P::Sampler* TextureCache<P>::GetComputeSampler(u32 index) {
+SamplerId TextureCache<P>::GetComputeSamplerId(u32 index) {
if (index > channel_state->compute_sampler_table.Limit()) {
LOG_DEBUG(HW_GPU, "Invalid sampler index={}", index);
- return &slot_samplers[NULL_SAMPLER_ID];
+ return NULL_SAMPLER_ID;
}
const auto [descriptor, is_new] = channel_state->compute_sampler_table.Read(index);
SamplerId& id = channel_state->compute_sampler_ids[index];
if (is_new) {
id = FindSampler(descriptor);
}
- return &slot_samplers[id];
+ return id;
+}
+
+template <class P>
+const typename P::Sampler& TextureCache<P>::GetSampler(SamplerId id) const noexcept {
+ return slot_samplers[id];
+}
+
+template <class P>
+typename P::Sampler& TextureCache<P>::GetSampler(SamplerId id) noexcept {
+ return slot_samplers[id];
}
template <class P>
@@ -233,7 +304,7 @@ void TextureCache<P>::SynchronizeComputeDescriptors() {
}
template <class P>
-bool TextureCache<P>::RescaleRenderTargets(bool is_clear) {
+bool TextureCache<P>::RescaleRenderTargets() {
auto& flags = maxwell3d->dirty.flags;
u32 scale_rating = 0;
bool rescaled = false;
@@ -271,13 +342,13 @@ bool TextureCache<P>::RescaleRenderTargets(bool is_clear) {
ImageViewId& color_buffer_id = render_targets.color_buffer_ids[index];
if (flags[Dirty::ColorBuffer0 + index] || force) {
flags[Dirty::ColorBuffer0 + index] = false;
- BindRenderTarget(&color_buffer_id, FindColorBuffer(index, is_clear));
+ BindRenderTarget(&color_buffer_id, FindColorBuffer(index));
}
check_rescale(color_buffer_id, tmp_color_images[index]);
}
if (flags[Dirty::ZetaBuffer] || force) {
flags[Dirty::ZetaBuffer] = false;
- BindRenderTarget(&render_targets.depth_buffer_id, FindDepthBuffer(is_clear));
+ BindRenderTarget(&render_targets.depth_buffer_id, FindDepthBuffer());
}
check_rescale(render_targets.depth_buffer_id, tmp_depth_image);
@@ -342,7 +413,7 @@ void TextureCache<P>::UpdateRenderTargets(bool is_clear) {
return;
}
- const bool rescaled = RescaleRenderTargets(is_clear);
+ const bool rescaled = RescaleRenderTargets();
if (is_rescaling != rescaled) {
flags[Dirty::RescaleViewports] = true;
flags[Dirty::RescaleScissors] = true;
@@ -455,7 +526,7 @@ void TextureCache<P>::WriteMemory(VAddr cpu_addr, size_t size) {
template <class P>
void TextureCache<P>::DownloadMemory(VAddr cpu_addr, size_t size) {
- std::vector<ImageId> images;
+ boost::container::small_vector<ImageId, 16> images;
ForEachImageInRegion(cpu_addr, size, [&images](ImageId image_id, ImageBase& image) {
if (!image.IsSafeDownload()) {
return;
@@ -481,8 +552,34 @@ void TextureCache<P>::DownloadMemory(VAddr cpu_addr, size_t size) {
}
template <class P>
+std::optional<VideoCore::RasterizerDownloadArea> TextureCache<P>::GetFlushArea(VAddr cpu_addr,
+ u64 size) {
+ std::optional<VideoCore::RasterizerDownloadArea> area{};
+ ForEachImageInRegion(cpu_addr, size, [&](ImageId, ImageBase& image) {
+ if (False(image.flags & ImageFlagBits::GpuModified)) {
+ return;
+ }
+ if (!area) {
+ area.emplace();
+ area->start_address = cpu_addr;
+ area->end_address = cpu_addr + size;
+ area->preemtive = true;
+ }
+ area->start_address = std::min(area->start_address, image.cpu_addr);
+ area->end_address = std::max(area->end_address, image.cpu_addr_end);
+ for (auto image_view_id : image.image_view_ids) {
+ auto& image_view = slot_image_views[image_view_id];
+ image_view.flags |= ImageViewFlagBits::PreemtiveDownload;
+ }
+ area->preemtive &= image.info.forced_flushed;
+ image.info.forced_flushed = true;
+ });
+ return area;
+}
+
+template <class P>
void TextureCache<P>::UnmapMemory(VAddr cpu_addr, size_t size) {
- std::vector<ImageId> deleted_images;
+ boost::container::small_vector<ImageId, 16> deleted_images;
ForEachImageInRegion(cpu_addr, size, [&](ImageId id, Image&) { deleted_images.push_back(id); });
for (const ImageId id : deleted_images) {
Image& image = slot_images[id];
@@ -496,7 +593,7 @@ void TextureCache<P>::UnmapMemory(VAddr cpu_addr, size_t size) {
template <class P>
void TextureCache<P>::UnmapGPUMemory(size_t as_id, GPUVAddr gpu_addr, size_t size) {
- std::vector<ImageId> deleted_images;
+ boost::container::small_vector<ImageId, 16> deleted_images;
ForEachImageInRegionGPU(as_id, gpu_addr, size,
[&](ImageId id, Image&) { deleted_images.push_back(id); });
for (const ImageId id : deleted_images) {
@@ -654,25 +751,41 @@ template <class P>
void TextureCache<P>::CommitAsyncFlushes() {
// This is intentionally passing the value by copy
if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) {
- const std::span<const ImageId> download_ids = uncommitted_downloads;
+ auto& download_ids = uncommitted_downloads;
if (download_ids.empty()) {
committed_downloads.emplace_back(std::move(uncommitted_downloads));
uncommitted_downloads.clear();
- async_buffers.emplace_back(std::optional<AsyncBuffer>{});
+ async_buffers.emplace_back(std::move(uncommitted_async_buffers));
+ uncommitted_async_buffers.clear();
return;
}
size_t total_size_bytes = 0;
- for (const ImageId image_id : download_ids) {
- total_size_bytes += slot_images[image_id].unswizzled_size_bytes;
+ size_t last_async_buffer_id = uncommitted_async_buffers.size();
+ bool any_none_dma = false;
+ for (PendingDownload& download_info : download_ids) {
+ if (download_info.is_swizzle) {
+ total_size_bytes +=
+ Common::AlignUp(slot_images[download_info.object_id].unswizzled_size_bytes, 64);
+ any_none_dma = true;
+ download_info.async_buffer_id = last_async_buffer_id;
+ }
}
- auto download_map = runtime.DownloadStagingBuffer(total_size_bytes, true);
- for (const ImageId image_id : download_ids) {
- Image& image = slot_images[image_id];
- const auto copies = FullDownloadCopies(image.info);
- image.DownloadMemory(download_map, copies);
- download_map.offset += Common::AlignUp(image.unswizzled_size_bytes, 64);
+
+ if (any_none_dma) {
+ auto download_map = runtime.DownloadStagingBuffer(total_size_bytes, true);
+ for (const PendingDownload& download_info : download_ids) {
+ if (download_info.is_swizzle) {
+ Image& image = slot_images[download_info.object_id];
+ const auto copies = FullDownloadCopies(image.info);
+ image.DownloadMemory(download_map, copies);
+ download_map.offset += Common::AlignUp(image.unswizzled_size_bytes, 64);
+ }
+ }
+ uncommitted_async_buffers.emplace_back(download_map);
}
- async_buffers.emplace_back(download_map);
+
+ async_buffers.emplace_back(std::move(uncommitted_async_buffers));
+ uncommitted_async_buffers.clear();
}
committed_downloads.emplace_back(std::move(uncommitted_downloads));
uncommitted_downloads.clear();
@@ -684,39 +797,57 @@ void TextureCache<P>::PopAsyncFlushes() {
return;
}
if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) {
- const std::span<const ImageId> download_ids = committed_downloads.front();
+ const auto& download_ids = committed_downloads.front();
if (download_ids.empty()) {
committed_downloads.pop_front();
async_buffers.pop_front();
return;
}
- auto download_map = *async_buffers.front();
- std::span<u8> download_span = download_map.mapped_span;
+ auto download_map = std::move(async_buffers.front());
for (size_t i = download_ids.size(); i > 0; i--) {
- const ImageBase& image = slot_images[download_ids[i - 1]];
- const auto copies = FullDownloadCopies(image.info);
- download_map.offset -= Common::AlignUp(image.unswizzled_size_bytes, 64);
- std::span<u8> download_span_alt = download_span.subspan(download_map.offset);
- SwizzleImage(*gpu_memory, image.gpu_addr, image.info, copies, download_span_alt,
- swizzle_data_buffer);
+ auto& download_info = download_ids[i - 1];
+ auto& download_buffer = download_map[download_info.async_buffer_id];
+ if (download_info.is_swizzle) {
+ const ImageBase& image = slot_images[download_info.object_id];
+ const auto copies = FullDownloadCopies(image.info);
+ download_buffer.offset -= Common::AlignUp(image.unswizzled_size_bytes, 64);
+ std::span<u8> download_span =
+ download_buffer.mapped_span.subspan(download_buffer.offset);
+ SwizzleImage(*gpu_memory, image.gpu_addr, image.info, copies, download_span,
+ swizzle_data_buffer);
+ } else {
+ const BufferDownload& buffer_info = slot_buffer_downloads[download_info.object_id];
+ std::span<u8> download_span =
+ download_buffer.mapped_span.subspan(download_buffer.offset);
+ gpu_memory->WriteBlockUnsafe(buffer_info.address, download_span.data(),
+ buffer_info.size);
+ slot_buffer_downloads.erase(download_info.object_id);
+ }
+ }
+ for (auto& download_buffer : download_map) {
+ async_buffers_death_ring.emplace_back(download_buffer);
}
- runtime.FreeDeferredStagingBuffer(download_map);
committed_downloads.pop_front();
async_buffers.pop_front();
} else {
- const std::span<const ImageId> download_ids = committed_downloads.front();
+ const auto& download_ids = committed_downloads.front();
if (download_ids.empty()) {
committed_downloads.pop_front();
return;
}
size_t total_size_bytes = 0;
- for (const ImageId image_id : download_ids) {
- total_size_bytes += slot_images[image_id].unswizzled_size_bytes;
+ for (const PendingDownload& download_info : download_ids) {
+ if (download_info.is_swizzle) {
+ total_size_bytes += slot_images[download_info.object_id].unswizzled_size_bytes;
+ }
}
auto download_map = runtime.DownloadStagingBuffer(total_size_bytes);
const size_t original_offset = download_map.offset;
- for (const ImageId image_id : download_ids) {
- Image& image = slot_images[image_id];
+ for (const PendingDownload& download_info : download_ids) {
+ if (!download_info.is_swizzle) {
+ continue;
+ }
+ Image& image = slot_images[download_info.object_id];
const auto copies = FullDownloadCopies(image.info);
image.DownloadMemory(download_map, copies);
download_map.offset += image.unswizzled_size_bytes;
@@ -725,8 +856,11 @@ void TextureCache<P>::PopAsyncFlushes() {
runtime.Finish();
download_map.offset = original_offset;
std::span<u8> download_span = download_map.mapped_span;
- for (const ImageId image_id : download_ids) {
- const ImageBase& image = slot_images[image_id];
+ for (const PendingDownload& download_info : download_ids) {
+ if (!download_info.is_swizzle) {
+ continue;
+ }
+ const ImageBase& image = slot_images[download_info.object_id];
const auto copies = FullDownloadCopies(image.info);
SwizzleImage(*gpu_memory, image.gpu_addr, image.info, copies, download_span,
swizzle_data_buffer);
@@ -738,6 +872,26 @@ void TextureCache<P>::PopAsyncFlushes() {
}
template <class P>
+ImageId TextureCache<P>::DmaImageId(const Tegra::DMA::ImageOperand& operand, bool is_upload) {
+ const ImageInfo dst_info(operand);
+ const ImageId image_id = FindDMAImage(dst_info, operand.address);
+ if (!image_id) {
+ return NULL_IMAGE_ID;
+ }
+ auto& image = slot_images[image_id];
+ if (!is_upload && !image.info.dma_downloaded) {
+ // Force a full sync.
+ image.info.dma_downloaded = true;
+ return NULL_IMAGE_ID;
+ }
+ const auto base = image.TryFindBase(operand.address);
+ if (!base) {
+ return NULL_IMAGE_ID;
+ }
+ return image_id;
+}
+
+template <class P>
bool TextureCache<P>::IsRescaling() const noexcept {
return is_rescaling;
}
@@ -765,6 +919,76 @@ bool TextureCache<P>::IsRegionGpuModified(VAddr addr, size_t size) {
}
template <class P>
+std::pair<typename TextureCache<P>::Image*, BufferImageCopy> TextureCache<P>::DmaBufferImageCopy(
+ const Tegra::DMA::ImageCopy& copy_info, const Tegra::DMA::BufferOperand& buffer_operand,
+ const Tegra::DMA::ImageOperand& image_operand, ImageId image_id, bool modifies_image) {
+ const auto [level, base] = PrepareDmaImage(image_id, image_operand.address, modifies_image);
+ auto* image = &slot_images[image_id];
+ const u32 buffer_size = static_cast<u32>(buffer_operand.pitch * buffer_operand.height);
+ const u32 bpp = VideoCore::Surface::BytesPerBlock(image->info.format);
+ const auto convert = [old_bpp = image_operand.bytes_per_pixel, bpp](u32 value) {
+ return (old_bpp * value) / bpp;
+ };
+ const u32 base_x = convert(image_operand.params.origin.x.Value());
+ const u32 base_y = image_operand.params.origin.y.Value();
+ const u32 length_x = convert(copy_info.length_x);
+ const u32 length_y = copy_info.length_y;
+
+ const BufferImageCopy copy{
+ .buffer_offset = 0,
+ .buffer_size = buffer_size,
+ .buffer_row_length = convert(buffer_operand.pitch),
+ .buffer_image_height = buffer_operand.height,
+ .image_subresource =
+ {
+ .base_level = static_cast<s32>(level),
+ .base_layer = static_cast<s32>(base),
+ .num_layers = 1,
+ },
+ .image_offset =
+ {
+ .x = static_cast<s32>(base_x),
+ .y = static_cast<s32>(base_y),
+ .z = 0,
+ },
+ .image_extent =
+ {
+ .width = length_x,
+ .height = length_y,
+ .depth = 1,
+ },
+ };
+ return {image, copy};
+}
+
+template <class P>
+void TextureCache<P>::DownloadImageIntoBuffer(typename TextureCache<P>::Image* image,
+ typename TextureCache<P>::BufferType buffer,
+ size_t buffer_offset,
+ std::span<const VideoCommon::BufferImageCopy> copies,
+ GPUVAddr address, size_t size) {
+ if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) {
+ const BufferDownload new_buffer_download{address, size};
+ auto slot = slot_buffer_downloads.insert(new_buffer_download);
+ const PendingDownload new_download{false, uncommitted_async_buffers.size(), slot};
+ uncommitted_downloads.emplace_back(new_download);
+ auto download_map = runtime.DownloadStagingBuffer(size, true);
+ uncommitted_async_buffers.emplace_back(download_map);
+ std::array buffers{
+ buffer,
+ download_map.buffer,
+ };
+ std::array<u64, 2> buffer_offsets{
+ buffer_offset,
+ download_map.offset,
+ };
+ image->DownloadMemory(buffers, buffer_offsets, copies);
+ } else {
+ image->DownloadMemory(buffer, buffer_offset, copies);
+ }
+}
+
+template <class P>
void TextureCache<P>::RefreshContents(Image& image, ImageId image_id) {
if (False(image.flags & ImageFlagBits::CpuModified)) {
// Only upload modified images
@@ -773,10 +997,14 @@ void TextureCache<P>::RefreshContents(Image& image, ImageId image_id) {
image.flags &= ~ImageFlagBits::CpuModified;
TrackImage(image, image_id);
- if (image.info.num_samples > 1) {
+ if (image.info.num_samples > 1 && !runtime.CanUploadMSAA()) {
LOG_WARNING(HW_GPU, "MSAA image uploads are not implemented");
return;
}
+ if (True(image.flags & ImageFlagBits::AsynchronousDecode)) {
+ QueueAsyncDecode(image, image_id);
+ return;
+ }
auto staging = runtime.UploadStagingBuffer(MapSizeBytes(image));
UploadImageContents(image, staging);
runtime.InsertUploadMemoryBarrier();
@@ -873,7 +1101,7 @@ ImageId TextureCache<P>::FindImage(const ImageInfo& info, GPUVAddr gpu_addr,
const bool native_bgr = runtime.HasNativeBgr();
const bool flexible_formats = True(options & RelaxedOptions::Format);
ImageId image_id{};
- boost::container::small_vector<ImageId, 1> image_ids;
+ boost::container::small_vector<ImageId, 8> image_ids;
const auto lambda = [&](ImageId existing_image_id, ImageBase& existing_image) {
if (True(existing_image.flags & ImageFlagBits::Remapped)) {
return false;
@@ -990,6 +1218,65 @@ u64 TextureCache<P>::GetScaledImageSizeBytes(const ImageBase& image) {
}
template <class P>
+void TextureCache<P>::QueueAsyncDecode(Image& image, ImageId image_id) {
+ UNIMPLEMENTED_IF(False(image.flags & ImageFlagBits::Converted));
+ LOG_INFO(HW_GPU, "Queuing async texture decode");
+
+ image.flags |= ImageFlagBits::IsDecoding;
+ auto decode = std::make_unique<AsyncDecodeContext>();
+ auto* decode_ptr = decode.get();
+ decode->image_id = image_id;
+ async_decodes.push_back(std::move(decode));
+
+ Common::ScratchBuffer<u8> local_unswizzle_data_buffer(image.unswizzled_size_bytes);
+ const size_t guest_size_bytes = image.guest_size_bytes;
+ swizzle_data_buffer.resize_destructive(guest_size_bytes);
+ gpu_memory->ReadBlockUnsafe(image.gpu_addr, swizzle_data_buffer.data(), guest_size_bytes);
+ auto copies = UnswizzleImage(*gpu_memory, image.gpu_addr, image.info, swizzle_data_buffer,
+ local_unswizzle_data_buffer);
+ const size_t out_size = MapSizeBytes(image);
+
+ auto func = [out_size, copies, info = image.info,
+ input = std::move(local_unswizzle_data_buffer),
+ async_decode = decode_ptr]() mutable {
+ async_decode->decoded_data.resize_destructive(out_size);
+ std::span copies_span{copies.data(), copies.size()};
+ ConvertImage(input, info, async_decode->decoded_data, copies_span);
+
+ // TODO: Do we need this lock?
+ std::unique_lock lock{async_decode->mutex};
+ async_decode->copies = std::move(copies);
+ async_decode->complete = true;
+ };
+ texture_decode_worker.QueueWork(std::move(func));
+}
+
+template <class P>
+void TextureCache<P>::TickAsyncDecode() {
+ bool has_uploads{};
+ auto i = async_decodes.begin();
+ while (i != async_decodes.end()) {
+ auto* async_decode = i->get();
+ std::unique_lock lock{async_decode->mutex};
+ if (!async_decode->complete) {
+ ++i;
+ continue;
+ }
+ Image& image = slot_images[async_decode->image_id];
+ auto staging = runtime.UploadStagingBuffer(MapSizeBytes(image));
+ std::memcpy(staging.mapped_span.data(), async_decode->decoded_data.data(),
+ async_decode->decoded_data.size());
+ image.UploadMemory(staging, async_decode->copies);
+ image.flags &= ~ImageFlagBits::IsDecoding;
+ has_uploads = true;
+ i = async_decodes.erase(i);
+ }
+ if (has_uploads) {
+ runtime.InsertUploadMemoryBarrier();
+ }
+}
+
+template <class P>
bool TextureCache<P>::ScaleUp(Image& image) {
const bool has_copy = image.HasScaled();
const bool rescaled = image.ScaleUp();
@@ -1044,17 +1331,18 @@ ImageId TextureCache<P>::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VA
const size_t size_bytes = CalculateGuestSizeInBytes(new_info);
const bool broken_views = runtime.HasBrokenTextureViewFormats();
const bool native_bgr = runtime.HasNativeBgr();
- std::vector<ImageId> overlap_ids;
- std::unordered_set<ImageId> overlaps_found;
- std::vector<ImageId> left_aliased_ids;
- std::vector<ImageId> right_aliased_ids;
- std::unordered_set<ImageId> ignore_textures;
- std::vector<ImageId> bad_overlap_ids;
- std::vector<ImageId> all_siblings;
+ join_overlap_ids.clear();
+ join_overlaps_found.clear();
+ join_left_aliased_ids.clear();
+ join_right_aliased_ids.clear();
+ join_ignore_textures.clear();
+ join_bad_overlap_ids.clear();
+ join_copies_to_do.clear();
+ join_alias_indices.clear();
const bool this_is_linear = info.type == ImageType::Linear;
const auto region_check = [&](ImageId overlap_id, ImageBase& overlap) {
if (True(overlap.flags & ImageFlagBits::Remapped)) {
- ignore_textures.insert(overlap_id);
+ join_ignore_textures.insert(overlap_id);
return;
}
const bool overlap_is_linear = overlap.info.type == ImageType::Linear;
@@ -1064,11 +1352,11 @@ ImageId TextureCache<P>::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VA
if (this_is_linear && overlap_is_linear) {
if (info.pitch == overlap.info.pitch && gpu_addr == overlap.gpu_addr) {
// Alias linear images with the same pitch
- left_aliased_ids.push_back(overlap_id);
+ join_left_aliased_ids.push_back(overlap_id);
}
return;
}
- overlaps_found.insert(overlap_id);
+ join_overlaps_found.insert(overlap_id);
static constexpr bool strict_size = true;
const std::optional<OverlapResult> solution = ResolveOverlap(
new_info, gpu_addr, cpu_addr, overlap, strict_size, broken_views, native_bgr);
@@ -1076,34 +1364,33 @@ ImageId TextureCache<P>::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VA
gpu_addr = solution->gpu_addr;
cpu_addr = solution->cpu_addr;
new_info.resources = solution->resources;
- overlap_ids.push_back(overlap_id);
- all_siblings.push_back(overlap_id);
+ join_overlap_ids.push_back(overlap_id);
+ join_copies_to_do.emplace_back(JoinCopy{false, overlap_id});
return;
}
static constexpr auto options = RelaxedOptions::Size | RelaxedOptions::Format;
const ImageBase new_image_base(new_info, gpu_addr, cpu_addr);
if (IsSubresource(new_info, overlap, gpu_addr, options, broken_views, native_bgr)) {
- left_aliased_ids.push_back(overlap_id);
+ join_left_aliased_ids.push_back(overlap_id);
overlap.flags |= ImageFlagBits::Alias;
- all_siblings.push_back(overlap_id);
+ join_copies_to_do.emplace_back(JoinCopy{true, overlap_id});
} else if (IsSubresource(overlap.info, new_image_base, overlap.gpu_addr, options,
broken_views, native_bgr)) {
- right_aliased_ids.push_back(overlap_id);
+ join_right_aliased_ids.push_back(overlap_id);
overlap.flags |= ImageFlagBits::Alias;
- all_siblings.push_back(overlap_id);
+ join_copies_to_do.emplace_back(JoinCopy{true, overlap_id});
} else {
- bad_overlap_ids.push_back(overlap_id);
- overlap.flags |= ImageFlagBits::BadOverlap;
+ join_bad_overlap_ids.push_back(overlap_id);
}
};
ForEachImageInRegion(cpu_addr, size_bytes, region_check);
const auto region_check_gpu = [&](ImageId overlap_id, ImageBase& overlap) {
- if (!overlaps_found.contains(overlap_id)) {
+ if (!join_overlaps_found.contains(overlap_id)) {
if (True(overlap.flags & ImageFlagBits::Remapped)) {
- ignore_textures.insert(overlap_id);
+ join_ignore_textures.insert(overlap_id);
}
if (overlap.gpu_addr == gpu_addr && overlap.guest_size_bytes == size_bytes) {
- ignore_textures.insert(overlap_id);
+ join_ignore_textures.insert(overlap_id);
}
}
};
@@ -1111,11 +1398,11 @@ ImageId TextureCache<P>::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VA
bool can_rescale = info.rescaleable;
bool any_rescaled = false;
- for (const ImageId sibling_id : all_siblings) {
+ for (const auto& copy : join_copies_to_do) {
if (!can_rescale) {
break;
}
- Image& sibling = slot_images[sibling_id];
+ Image& sibling = slot_images[copy.id];
can_rescale &= ImageCanRescale(sibling);
any_rescaled |= True(sibling.flags & ImageFlagBits::Rescaled);
}
@@ -1123,13 +1410,13 @@ ImageId TextureCache<P>::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VA
can_rescale &= any_rescaled;
if (can_rescale) {
- for (const ImageId sibling_id : all_siblings) {
- Image& sibling = slot_images[sibling_id];
+ for (const auto& copy : join_copies_to_do) {
+ Image& sibling = slot_images[copy.id];
ScaleUp(sibling);
}
} else {
- for (const ImageId sibling_id : all_siblings) {
- Image& sibling = slot_images[sibling_id];
+ for (const auto& copy : join_copies_to_do) {
+ Image& sibling = slot_images[copy.id];
ScaleDown(sibling);
}
}
@@ -1137,11 +1424,11 @@ ImageId TextureCache<P>::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VA
const ImageId new_image_id = slot_images.insert(runtime, new_info, gpu_addr, cpu_addr);
Image& new_image = slot_images[new_image_id];
- if (!gpu_memory->IsContinousRange(new_image.gpu_addr, new_image.guest_size_bytes)) {
+ if (!gpu_memory->IsContinuousRange(new_image.gpu_addr, new_image.guest_size_bytes)) {
new_image.flags |= ImageFlagBits::Sparse;
}
- for (const ImageId overlap_id : ignore_textures) {
+ for (const ImageId overlap_id : join_ignore_textures) {
Image& overlap = slot_images[overlap_id];
if (True(overlap.flags & ImageFlagBits::GpuModified)) {
UNIMPLEMENTED();
@@ -1162,44 +1449,81 @@ ImageId TextureCache<P>::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VA
ScaleDown(new_image);
}
- for (const ImageId overlap_id : overlap_ids) {
- Image& overlap = slot_images[overlap_id];
- if (True(overlap.flags & ImageFlagBits::GpuModified)) {
- new_image.flags |= ImageFlagBits::GpuModified;
- }
- if (overlap.info.num_samples != new_image.info.num_samples) {
- LOG_WARNING(HW_GPU, "Copying between images with different samples is not implemented");
- } else {
- const auto& resolution = Settings::values.resolution_info;
- const SubresourceBase base = new_image.TryFindBase(overlap.gpu_addr).value();
- const u32 up_scale = can_rescale ? resolution.up_scale : 1;
- const u32 down_shift = can_rescale ? resolution.down_shift : 0;
- auto copies = MakeShrinkImageCopies(new_info, overlap.info, base, up_scale, down_shift);
- runtime.CopyImage(new_image, overlap, std::move(copies));
- }
- if (True(overlap.flags & ImageFlagBits::Tracked)) {
- UntrackImage(overlap, overlap_id);
- }
- UnregisterImage(overlap_id);
- DeleteImage(overlap_id);
- }
+ std::ranges::sort(join_copies_to_do, [this](const JoinCopy& lhs, const JoinCopy& rhs) {
+ const ImageBase& lhs_image = slot_images[lhs.id];
+ const ImageBase& rhs_image = slot_images[rhs.id];
+ return lhs_image.modification_tick < rhs_image.modification_tick;
+ });
+
ImageBase& new_image_base = new_image;
- for (const ImageId aliased_id : right_aliased_ids) {
+ for (const ImageId aliased_id : join_right_aliased_ids) {
ImageBase& aliased = slot_images[aliased_id];
- AddImageAlias(new_image_base, aliased, new_image_id, aliased_id);
+ size_t alias_index = new_image_base.aliased_images.size();
+ if (!AddImageAlias(new_image_base, aliased, new_image_id, aliased_id)) {
+ continue;
+ }
+ join_alias_indices.emplace(aliased_id, alias_index);
new_image.flags |= ImageFlagBits::Alias;
}
- for (const ImageId aliased_id : left_aliased_ids) {
+ for (const ImageId aliased_id : join_left_aliased_ids) {
ImageBase& aliased = slot_images[aliased_id];
- AddImageAlias(aliased, new_image_base, aliased_id, new_image_id);
+ size_t alias_index = new_image_base.aliased_images.size();
+ if (!AddImageAlias(aliased, new_image_base, aliased_id, new_image_id)) {
+ continue;
+ }
+ join_alias_indices.emplace(aliased_id, alias_index);
new_image.flags |= ImageFlagBits::Alias;
}
- for (const ImageId aliased_id : bad_overlap_ids) {
+ for (const ImageId aliased_id : join_bad_overlap_ids) {
ImageBase& aliased = slot_images[aliased_id];
aliased.overlapping_images.push_back(new_image_id);
new_image.overlapping_images.push_back(aliased_id);
- new_image.flags |= ImageFlagBits::BadOverlap;
+ if (aliased.info.resources.levels == 1 && aliased.info.block.depth == 0 &&
+ aliased.overlapping_images.size() > 1) {
+ aliased.flags |= ImageFlagBits::BadOverlap;
+ }
+ if (new_image.info.resources.levels == 1 && new_image.info.block.depth == 0 &&
+ new_image.overlapping_images.size() > 1) {
+ new_image.flags |= ImageFlagBits::BadOverlap;
+ }
+ }
+
+ for (const auto& copy_object : join_copies_to_do) {
+ Image& overlap = slot_images[copy_object.id];
+ if (copy_object.is_alias) {
+ if (!overlap.IsSafeDownload()) {
+ continue;
+ }
+ const auto alias_pointer = join_alias_indices.find(copy_object.id);
+ if (alias_pointer == join_alias_indices.end()) {
+ continue;
+ }
+ const AliasedImage& aliased = new_image.aliased_images[alias_pointer->second];
+ CopyImage(new_image_id, aliased.id, aliased.copies);
+ new_image.modification_tick = overlap.modification_tick;
+ continue;
+ }
+ if (True(overlap.flags & ImageFlagBits::GpuModified)) {
+ new_image.flags |= ImageFlagBits::GpuModified;
+ const auto& resolution = Settings::values.resolution_info;
+ const SubresourceBase base = new_image.TryFindBase(overlap.gpu_addr).value();
+ const u32 up_scale = can_rescale ? resolution.up_scale : 1;
+ const u32 down_shift = can_rescale ? resolution.down_shift : 0;
+ auto copies = MakeShrinkImageCopies(new_info, overlap.info, base, up_scale, down_shift);
+ if (overlap.info.num_samples != new_image.info.num_samples) {
+ runtime.CopyImageMSAA(new_image, overlap, std::move(copies));
+ } else {
+ runtime.CopyImage(new_image, overlap, std::move(copies));
+ }
+ new_image.modification_tick = overlap.modification_tick;
+ }
+ if (True(overlap.flags & ImageFlagBits::Tracked)) {
+ UntrackImage(overlap, copy_object.id);
+ }
+ UnregisterImage(copy_object.id);
+ DeleteImage(copy_object.id);
}
+
RegisterImage(new_image_id);
return new_image_id;
}
@@ -1289,6 +1613,63 @@ std::optional<typename TextureCache<P>::BlitImages> TextureCache<P>::GetBlitImag
}
template <class P>
+ImageId TextureCache<P>::FindDMAImage(const ImageInfo& info, GPUVAddr gpu_addr) {
+ std::optional<VAddr> cpu_addr = gpu_memory->GpuToCpuAddress(gpu_addr);
+ if (!cpu_addr) {
+ cpu_addr = gpu_memory->GpuToCpuAddress(gpu_addr, CalculateGuestSizeInBytes(info));
+ if (!cpu_addr) {
+ return ImageId{};
+ }
+ }
+ ImageId image_id{};
+ boost::container::small_vector<ImageId, 8> image_ids;
+ const auto lambda = [&](ImageId existing_image_id, ImageBase& existing_image) {
+ if (True(existing_image.flags & ImageFlagBits::Remapped)) {
+ return false;
+ }
+ if (info.type == ImageType::Linear || existing_image.info.type == ImageType::Linear)
+ [[unlikely]] {
+ const bool strict_size = True(existing_image.flags & ImageFlagBits::Strong);
+ const ImageInfo& existing = existing_image.info;
+ if (existing_image.gpu_addr == gpu_addr && existing.type == info.type &&
+ existing.pitch == info.pitch &&
+ IsPitchLinearSameSize(existing, info, strict_size) &&
+ IsViewCompatible(existing.format, info.format, false, true)) {
+ image_id = existing_image_id;
+ image_ids.push_back(existing_image_id);
+ return true;
+ }
+ } else if (IsSubCopy(info, existing_image, gpu_addr)) {
+ image_id = existing_image_id;
+ image_ids.push_back(existing_image_id);
+ return true;
+ }
+ return false;
+ };
+ ForEachImageInRegion(*cpu_addr, CalculateGuestSizeInBytes(info), lambda);
+ if (image_ids.size() <= 1) [[likely]] {
+ return image_id;
+ }
+ auto image_ids_compare = [this](ImageId a, ImageId b) {
+ auto& image_a = slot_images[a];
+ auto& image_b = slot_images[b];
+ return image_a.modification_tick < image_b.modification_tick;
+ };
+ return *std::ranges::max_element(image_ids, image_ids_compare);
+}
+
+template <class P>
+std::pair<u32, u32> TextureCache<P>::PrepareDmaImage(ImageId dst_id, GPUVAddr base_addr,
+ bool mark_as_modified) {
+ const auto& image = slot_images[dst_id];
+ const auto base = image.TryFindBase(base_addr);
+ PrepareImage(dst_id, mark_as_modified, false);
+ const auto& new_image = slot_images[dst_id];
+ lru_cache.Touch(new_image.lru_index, frame_tick);
+ return std::make_pair(base->level, base->layer);
+}
+
+template <class P>
SamplerId TextureCache<P>::FindSampler(const TSCEntry& config) {
if (std::ranges::all_of(config.raw, [](u64 value) { return value == 0; })) {
return NULL_SAMPLER_ID;
@@ -1301,7 +1682,7 @@ SamplerId TextureCache<P>::FindSampler(const TSCEntry& config) {
}
template <class P>
-ImageViewId TextureCache<P>::FindColorBuffer(size_t index, bool is_clear) {
+ImageViewId TextureCache<P>::FindColorBuffer(size_t index) {
const auto& regs = maxwell3d->regs;
if (index >= regs.rt_control.count) {
return ImageViewId{};
@@ -1314,12 +1695,12 @@ ImageViewId TextureCache<P>::FindColorBuffer(size_t index, bool is_clear) {
if (rt.format == Tegra::RenderTargetFormat::NONE) {
return ImageViewId{};
}
- const ImageInfo info(regs, index);
- return FindRenderTargetView(info, gpu_addr, is_clear);
+ const ImageInfo info(regs.rt[index], regs.anti_alias_samples_mode);
+ return FindRenderTargetView(info, gpu_addr);
}
template <class P>
-ImageViewId TextureCache<P>::FindDepthBuffer(bool is_clear) {
+ImageViewId TextureCache<P>::FindDepthBuffer() {
const auto& regs = maxwell3d->regs;
if (!regs.zeta_enable) {
return ImageViewId{};
@@ -1328,19 +1709,17 @@ ImageViewId TextureCache<P>::FindDepthBuffer(bool is_clear) {
if (gpu_addr == 0) {
return ImageViewId{};
}
- const ImageInfo info(regs);
- return FindRenderTargetView(info, gpu_addr, is_clear);
+ const ImageInfo info(regs.zeta, regs.zeta_size, regs.anti_alias_samples_mode);
+ return FindRenderTargetView(info, gpu_addr);
}
template <class P>
-ImageViewId TextureCache<P>::FindRenderTargetView(const ImageInfo& info, GPUVAddr gpu_addr,
- bool is_clear) {
- const auto options = is_clear ? RelaxedOptions::Samples : RelaxedOptions{};
+ImageViewId TextureCache<P>::FindRenderTargetView(const ImageInfo& info, GPUVAddr gpu_addr) {
ImageId image_id{};
bool delete_state = has_deleted_images;
do {
has_deleted_images = false;
- image_id = FindOrInsertImage(info, gpu_addr, options);
+ image_id = FindOrInsertImage(info, gpu_addr);
delete_state |= has_deleted_images;
} while (has_deleted_images);
has_deleted_images = delete_state;
@@ -1427,37 +1806,38 @@ void TextureCache<P>::ForEachImageInRegionGPU(size_t as_id, GPUVAddr gpu_addr, s
return;
}
auto& gpu_page_table = gpu_page_table_storage[*storage_id];
- ForEachGPUPage(gpu_addr, size, [this, gpu_page_table, &images, gpu_addr, size, func](u64 page) {
- const auto it = gpu_page_table.find(page);
- if (it == gpu_page_table.end()) {
- if constexpr (BOOL_BREAK) {
- return false;
- } else {
- return;
- }
- }
- for (const ImageId image_id : it->second) {
- Image& image = slot_images[image_id];
- if (True(image.flags & ImageFlagBits::Picked)) {
- continue;
- }
- if (!image.OverlapsGPU(gpu_addr, size)) {
- continue;
- }
- image.flags |= ImageFlagBits::Picked;
- images.push_back(image_id);
- if constexpr (BOOL_BREAK) {
- if (func(image_id, image)) {
- return true;
- }
- } else {
- func(image_id, image);
- }
- }
- if constexpr (BOOL_BREAK) {
- return false;
- }
- });
+ ForEachGPUPage(gpu_addr, size,
+ [this, &gpu_page_table, &images, gpu_addr, size, func](u64 page) {
+ const auto it = gpu_page_table.find(page);
+ if (it == gpu_page_table.end()) {
+ if constexpr (BOOL_BREAK) {
+ return false;
+ } else {
+ return;
+ }
+ }
+ for (const ImageId image_id : it->second) {
+ Image& image = slot_images[image_id];
+ if (True(image.flags & ImageFlagBits::Picked)) {
+ continue;
+ }
+ if (!image.OverlapsGPU(gpu_addr, size)) {
+ continue;
+ }
+ image.flags |= ImageFlagBits::Picked;
+ images.push_back(image_id);
+ if constexpr (BOOL_BREAK) {
+ if (func(image_id, image)) {
+ return true;
+ }
+ } else {
+ func(image_id, image);
+ }
+ }
+ if constexpr (BOOL_BREAK) {
+ return false;
+ }
+ });
for (const ImageId image_id : images) {
slot_images[image_id].flags &= ~ImageFlagBits::Picked;
}
@@ -1549,10 +1929,6 @@ void TextureCache<P>::RegisterImage(ImageId image_id) {
tentative_size = EstimatedDecompressedSize(tentative_size, image.info.format);
}
total_used_memory += Common::AlignUp(tentative_size, 1024);
- if (total_used_memory > critical_memory && critical_gc < GC_EMERGENCY_COUNTS) {
- RunGarbageCollector();
- critical_gc++;
- }
image.lru_index = lru_cache.Insert(image_id, frame_tick);
ForEachGPUPage(image.gpu_addr, image.guest_size_bytes, [this, image_id](u64 page) {
@@ -1566,7 +1942,7 @@ void TextureCache<P>::RegisterImage(ImageId image_id) {
image.map_view_id = map_id;
return;
}
- std::vector<ImageViewId> sparse_maps{};
+ boost::container::small_vector<ImageViewId, 16> sparse_maps;
ForEachSparseSegment(
image, [this, image_id, &sparse_maps](GPUVAddr gpu_addr, VAddr cpu_addr, size_t size) {
auto map_id = slot_map_views.insert(gpu_addr, cpu_addr, size, image_id);
@@ -1841,7 +2217,7 @@ void TextureCache<P>::MarkModification(ImageBase& image) noexcept {
template <class P>
void TextureCache<P>::SynchronizeAliases(ImageId image_id) {
- boost::container::small_vector<const AliasedImage*, 1> aliased_images;
+ boost::container::small_vector<const AliasedImage*, 8> aliased_images;
Image& image = slot_images[image_id];
bool any_rescaled = True(image.flags & ImageFlagBits::Rescaled);
bool any_modified = True(image.flags & ImageFlagBits::GpuModified);
@@ -2019,7 +2395,8 @@ void TextureCache<P>::BindRenderTarget(ImageViewId* old_id, ImageViewId new_id)
if (new_id) {
const ImageViewBase& old_view = slot_image_views[new_id];
if (True(old_view.flags & ImageViewFlagBits::PreemtiveDownload)) {
- uncommitted_downloads.push_back(old_view.image_id);
+ const PendingDownload new_download{true, 0, old_view.image_id};
+ uncommitted_downloads.emplace_back(new_download);
}
}
*old_id = new_id;