Diffstat (limited to 'src/video_core')
50 files changed, 852 insertions, 279 deletions
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index 06fd40851..1f656ffa8 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -55,6 +55,19 @@ constexpr u32 NUM_STORAGE_BUFFERS = 16; constexpr u32 NUM_TEXTURE_BUFFERS = 16; constexpr u32 NUM_STAGES = 5; +enum class ObtainBufferSynchronize : u32 { + NoSynchronize = 0, + FullSynchronize = 1, + SynchronizeNoDirty = 2, +}; + +enum class ObtainBufferOperation : u32 { + DoNothing = 0, + MarkAsWritten = 1, + DiscardWrite = 2, + MarkQuery = 3, +}; + using UniformBufferSizes = std::array<std::array<u32, NUM_GRAPHICS_UNIFORM_BUFFERS>, NUM_STAGES>; using ComputeUniformBufferSizes = std::array<u32, NUM_COMPUTE_UNIFORM_BUFFERS>; @@ -191,6 +204,10 @@ public: bool DMAClear(GPUVAddr src_address, u64 amount, u32 value); + [[nodiscard]] std::pair<Buffer*, u32> ObtainBuffer(GPUVAddr gpu_addr, u32 size, + ObtainBufferSynchronize sync_info, + ObtainBufferOperation post_op); + /// Return true when a CPU region is modified from the GPU [[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size); @@ -366,7 +383,8 @@ private: void NotifyBufferDeletion(); - [[nodiscard]] Binding StorageBufferBinding(GPUVAddr ssbo_addr, bool is_written = false) const; + [[nodiscard]] Binding StorageBufferBinding(GPUVAddr ssbo_addr, u32 cbuf_index, + bool is_written = false) const; [[nodiscard]] TextureBufferBinding GetTextureBufferBinding(GPUVAddr gpu_addr, u32 size, PixelFormat format); @@ -642,6 +660,42 @@ bool BufferCache<P>::DMAClear(GPUVAddr dst_address, u64 amount, u32 value) { } template <class P> +std::pair<typename P::Buffer*, u32> BufferCache<P>::ObtainBuffer(GPUVAddr gpu_addr, u32 size, + ObtainBufferSynchronize sync_info, + ObtainBufferOperation post_op) { + const std::optional<VAddr> cpu_addr = gpu_memory->GpuToCpuAddress(gpu_addr); + if (!cpu_addr) { + return {&slot_buffers[NULL_BUFFER_ID], 0}; + } + const BufferId buffer_id = FindBuffer(*cpu_addr, size); + Buffer& buffer = slot_buffers[buffer_id]; + + // synchronize op + switch (sync_info) { + case ObtainBufferSynchronize::FullSynchronize: + SynchronizeBuffer(buffer, *cpu_addr, size); + break; + default: + break; + } + + switch (post_op) { + case ObtainBufferOperation::MarkAsWritten: + MarkWrittenBuffer(buffer_id, *cpu_addr, size); + break; + case ObtainBufferOperation::DiscardWrite: { + IntervalType interval{*cpu_addr, size}; + ClearDownload(interval); + break; + } + default: + break; + } + + return {&buffer, buffer.Offset(*cpu_addr)}; +} + +template <class P> void BufferCache<P>::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size) { const std::optional<VAddr> cpu_addr = gpu_memory->GpuToCpuAddress(gpu_addr); @@ -749,7 +803,7 @@ void BufferCache<P>::BindGraphicsStorageBuffer(size_t stage, size_t ssbo_index, const auto& cbufs = maxwell3d->state.shader_stages[stage]; const GPUVAddr ssbo_addr = cbufs.const_buffers[cbuf_index].address + cbuf_offset; - storage_buffers[stage][ssbo_index] = StorageBufferBinding(ssbo_addr, is_written); + storage_buffers[stage][ssbo_index] = StorageBufferBinding(ssbo_addr, cbuf_index, is_written); } template <class P> @@ -789,7 +843,7 @@ void BufferCache<P>::BindComputeStorageBuffer(size_t ssbo_index, u32 cbuf_index, const auto& cbufs = launch_desc.const_buffer_config; const GPUVAddr ssbo_addr = cbufs[cbuf_index].Address() + cbuf_offset; - compute_storage_buffers[ssbo_index] = StorageBufferBinding(ssbo_addr, is_written); + 
compute_storage_buffers[ssbo_index] = StorageBufferBinding(ssbo_addr, cbuf_index, is_written); } template <class P> @@ -1935,11 +1989,26 @@ void BufferCache<P>::NotifyBufferDeletion() { template <class P> typename BufferCache<P>::Binding BufferCache<P>::StorageBufferBinding(GPUVAddr ssbo_addr, + u32 cbuf_index, bool is_written) const { const GPUVAddr gpu_addr = gpu_memory->Read<u64>(ssbo_addr); - const u32 size = gpu_memory->Read<u32>(ssbo_addr + 8); + const auto size = [&]() { + const bool is_nvn_cbuf = cbuf_index == 0; + // The NVN driver buffer (index 0) is known to pack the SSBO address followed by its size. + if (is_nvn_cbuf) { + return gpu_memory->Read<u32>(ssbo_addr + 8); + } + // Other titles (notably Doom Eternal) may use STG/LDG on buffer addresses in custom defined + // cbufs, which do not store the sizes adjacent to the addresses, so use the fully + // mapped buffer size for now. + const u32 memory_layout_size = static_cast<u32>(gpu_memory->GetMemoryLayoutSize(gpu_addr)); + LOG_INFO(HW_GPU, "Binding storage buffer for cbuf index {}, MemoryLayoutSize 0x{:X}", + cbuf_index, memory_layout_size); + return memory_layout_size; + }(); const std::optional<VAddr> cpu_addr = gpu_memory->GpuToCpuAddress(gpu_addr); if (!cpu_addr || size == 0) { + LOG_WARNING(HW_GPU, "Failed to find storage buffer for cbuf index {}", cbuf_index); return NULL_BINDING; } const VAddr cpu_end = Common::AlignUp(*cpu_addr + size, Core::Memory::YUZU_PAGESIZE); diff --git a/src/video_core/control/channel_state_cache.h b/src/video_core/control/channel_state_cache.h index cdaf4f8d5..46bc9e322 100644 --- a/src/video_core/control/channel_state_cache.h +++ b/src/video_core/control/channel_state_cache.h @@ -44,7 +44,7 @@ public: template <class P> class ChannelSetupCaches { public: - /// Operations for seting the channel of execution. + /// Operations for setting the channel of execution. virtual ~ChannelSetupCaches(); /// Create channel state. 
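For context on the ObtainBuffer API introduced above, here is a minimal caller sketch. The helper name and the is_download flag are hypothetical; the cache type and enums are the ones added in this hunk. It mirrors how the AccelerateDMA paths later in this diff pick their synchronize/post-op pair: a full synchronize so the host copy is current before use, and MarkAsWritten only when the GPU result must later be written back.

template <class P>
std::pair<typename P::Buffer*, u32> ObtainForDma(VideoCommon::BufferCache<P>& buffer_cache,
                                                 GPUVAddr gpu_addr, u32 size, bool is_download) {
    // Bring the host-visible copy of the range up to date before touching it.
    constexpr auto sync = VideoCommon::ObtainBufferSynchronize::FullSynchronize;
    // A download writes into the buffer, so mark the range as GPU-modified;
    // an upload only reads from it, so no post-op is needed.
    const auto post_op = is_download ? VideoCommon::ObtainBufferOperation::MarkAsWritten
                                     : VideoCommon::ObtainBufferOperation::DoNothing;
    // The call sites in this diff hold buffer_cache.mutex around this call.
    return buffer_cache.ObtainBuffer(gpu_addr, size, sync, post_op);
}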
diff --git a/src/video_core/engines/draw_manager.cpp b/src/video_core/engines/draw_manager.cpp index 1d22d25f1..0e94c521a 100644 --- a/src/video_core/engines/draw_manager.cpp +++ b/src/video_core/engines/draw_manager.cpp @@ -164,6 +164,7 @@ void DrawManager::DrawEnd(u32 instance_count, bool force_draw) { draw_state.index_buffer.count = static_cast<u32>(draw_state.inline_index_draw_indexes.size() / 4); draw_state.index_buffer.format = Maxwell3D::Regs::IndexFormat::UnsignedInt; + maxwell3d->dirty.flags[VideoCommon::Dirty::IndexBuffer] = true; ProcessDraw(true, instance_count); draw_state.inline_index_draw_indexes.clear(); break; diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp index 7762c7d96..e68850dc5 100644 --- a/src/video_core/engines/maxwell_dma.cpp +++ b/src/video_core/engines/maxwell_dma.cpp @@ -14,7 +14,13 @@ #include "video_core/textures/decoders.h" MICROPROFILE_DECLARE(GPU_DMAEngine); +MICROPROFILE_DECLARE(GPU_DMAEngineBL); +MICROPROFILE_DECLARE(GPU_DMAEngineLB); +MICROPROFILE_DECLARE(GPU_DMAEngineBB); MICROPROFILE_DEFINE(GPU_DMAEngine, "GPU", "DMA Engine", MP_RGB(224, 224, 128)); +MICROPROFILE_DEFINE(GPU_DMAEngineBL, "GPU", "DMA Engine Block - Linear", MP_RGB(224, 224, 128)); +MICROPROFILE_DEFINE(GPU_DMAEngineLB, "GPU", "DMA Engine Linear - Block", MP_RGB(224, 224, 128)); +MICROPROFILE_DEFINE(GPU_DMAEngineBB, "GPU", "DMA Engine Block - Block", MP_RGB(224, 224, 128)); namespace Tegra::Engines { @@ -72,6 +78,7 @@ void MaxwellDMA::Launch() { memory_manager.FlushCaching(); if (!is_src_pitch && !is_dst_pitch) { // If both the source and the destination are in block layout, assert. + MICROPROFILE_SCOPE(GPU_DMAEngineBB); CopyBlockLinearToBlockLinear(); ReleaseSemaphore(); return; @@ -87,8 +94,10 @@ void MaxwellDMA::Launch() { } } else { if (!is_src_pitch && is_dst_pitch) { + MICROPROFILE_SCOPE(GPU_DMAEngineBL); CopyBlockLinearToPitch(); } else { + MICROPROFILE_SCOPE(GPU_DMAEngineLB); CopyPitchToBlockLinear(); } } @@ -153,21 +162,35 @@ void MaxwellDMA::Launch() { } void MaxwellDMA::CopyBlockLinearToPitch() { - UNIMPLEMENTED_IF(regs.src_params.block_size.width != 0); - UNIMPLEMENTED_IF(regs.src_params.layer != 0); - - const bool is_remapping = regs.launch_dma.remap_enable != 0; - - // Optimized path for micro copies. - const size_t dst_size = static_cast<size_t>(regs.pitch_out) * regs.line_count; - if (!is_remapping && dst_size < GOB_SIZE && regs.pitch_out <= GOB_SIZE_X && - regs.src_params.height > GOB_SIZE_Y) { - FastCopyBlockLinearToPitch(); + UNIMPLEMENTED_IF(regs.launch_dma.remap_enable != 0); + + u32 bytes_per_pixel = 1; + DMA::ImageOperand src_operand; + src_operand.bytes_per_pixel = bytes_per_pixel; + src_operand.params = regs.src_params; + src_operand.address = regs.offset_in; + + DMA::BufferOperand dst_operand; + dst_operand.pitch = regs.pitch_out; + dst_operand.width = regs.line_length_in; + dst_operand.height = regs.line_count; + dst_operand.address = regs.offset_out; + DMA::ImageCopy copy_info{}; + copy_info.length_x = regs.line_length_in; + copy_info.length_y = regs.line_count; + auto& accelerate = rasterizer->AccessAccelerateDMA(); + if (accelerate.ImageToBuffer(copy_info, src_operand, dst_operand)) { return; } + UNIMPLEMENTED_IF(regs.src_params.block_size.width != 0); + UNIMPLEMENTED_IF(regs.src_params.block_size.depth != 0); + UNIMPLEMENTED_IF(regs.src_params.block_size.depth == 0 && regs.src_params.depth != 1); + // Deswizzle the input and copy it over. 
- const Parameters& src_params = regs.src_params; + const DMA::Parameters& src_params = regs.src_params; + + const bool is_remapping = regs.launch_dma.remap_enable != 0; const u32 num_remap_components = regs.remap_const.num_dst_components_minus_one + 1; const u32 remap_components_size = regs.remap_const.component_size_minus_one + 1; @@ -187,7 +210,7 @@ void MaxwellDMA::CopyBlockLinearToPitch() { x_offset >>= bpp_shift; } - const u32 bytes_per_pixel = base_bpp << bpp_shift; + bytes_per_pixel = base_bpp << bpp_shift; const u32 height = src_params.height; const u32 depth = src_params.depth; const u32 block_height = src_params.block_size.height; @@ -195,11 +218,12 @@ void MaxwellDMA::CopyBlockLinearToPitch() { const size_t src_size = CalculateSize(true, bytes_per_pixel, width, height, depth, block_height, block_depth); + const size_t dst_size = static_cast<size_t>(regs.pitch_out) * regs.line_count; read_buffer.resize_destructive(src_size); write_buffer.resize_destructive(dst_size); - memory_manager.ReadBlock(regs.offset_in, read_buffer.data(), src_size); - memory_manager.ReadBlock(regs.offset_out, write_buffer.data(), dst_size); + memory_manager.ReadBlock(src_operand.address, read_buffer.data(), src_size); + memory_manager.ReadBlockUnsafe(dst_operand.address, write_buffer.data(), dst_size); UnswizzleSubrect(write_buffer, read_buffer, bytes_per_pixel, width, height, depth, x_offset, src_params.origin.y, x_elements, regs.line_count, block_height, block_depth, @@ -216,6 +240,24 @@ void MaxwellDMA::CopyPitchToBlockLinear() { const u32 num_remap_components = regs.remap_const.num_dst_components_minus_one + 1; const u32 remap_components_size = regs.remap_const.component_size_minus_one + 1; + u32 bytes_per_pixel = 1; + DMA::ImageOperand dst_operand; + dst_operand.bytes_per_pixel = bytes_per_pixel; + dst_operand.params = regs.dst_params; + dst_operand.address = regs.offset_out; + DMA::BufferOperand src_operand; + src_operand.pitch = regs.pitch_in; + src_operand.width = regs.line_length_in; + src_operand.height = regs.line_count; + src_operand.address = regs.offset_in; + DMA::ImageCopy copy_info{}; + copy_info.length_x = regs.line_length_in; + copy_info.length_y = regs.line_count; + auto& accelerate = rasterizer->AccessAccelerateDMA(); + if (accelerate.BufferToImage(copy_info, src_operand, dst_operand)) { + return; + } + const auto& dst_params = regs.dst_params; const u32 base_bpp = !is_remapping ? 
1U : num_remap_components * remap_components_size; @@ -233,7 +275,7 @@ void MaxwellDMA::CopyPitchToBlockLinear() { x_offset >>= bpp_shift; } - const u32 bytes_per_pixel = base_bpp << bpp_shift; + bytes_per_pixel = base_bpp << bpp_shift; const u32 height = dst_params.height; const u32 depth = dst_params.depth; const u32 block_height = dst_params.block_size.height; @@ -260,45 +302,14 @@ void MaxwellDMA::CopyPitchToBlockLinear() { memory_manager.WriteBlockCached(regs.offset_out, write_buffer.data(), dst_size); } -void MaxwellDMA::FastCopyBlockLinearToPitch() { - const u32 bytes_per_pixel = 1U; - const size_t src_size = GOB_SIZE; - const size_t dst_size = static_cast<size_t>(regs.pitch_out) * regs.line_count; - u32 pos_x = regs.src_params.origin.x; - u32 pos_y = regs.src_params.origin.y; - const u64 offset = GetGOBOffset(regs.src_params.width, regs.src_params.height, pos_x, pos_y, - regs.src_params.block_size.height, bytes_per_pixel); - const u32 x_in_gob = 64 / bytes_per_pixel; - pos_x = pos_x % x_in_gob; - pos_y = pos_y % 8; - - read_buffer.resize_destructive(src_size); - write_buffer.resize_destructive(dst_size); - - if (Settings::IsGPULevelExtreme()) { - memory_manager.ReadBlock(regs.offset_in + offset, read_buffer.data(), src_size); - memory_manager.ReadBlock(regs.offset_out, write_buffer.data(), dst_size); - } else { - memory_manager.ReadBlockUnsafe(regs.offset_in + offset, read_buffer.data(), src_size); - memory_manager.ReadBlockUnsafe(regs.offset_out, write_buffer.data(), dst_size); - } - - UnswizzleSubrect(write_buffer, read_buffer, bytes_per_pixel, regs.src_params.width, - regs.src_params.height, 1, pos_x, pos_y, regs.line_length_in, regs.line_count, - regs.src_params.block_size.height, regs.src_params.block_size.depth, - regs.pitch_out); - - memory_manager.WriteBlockCached(regs.offset_out, write_buffer.data(), dst_size); -} - void MaxwellDMA::CopyBlockLinearToBlockLinear() { UNIMPLEMENTED_IF(regs.src_params.block_size.width != 0); const bool is_remapping = regs.launch_dma.remap_enable != 0; // Deswizzle the input and copy it over. 
- const Parameters& src = regs.src_params; - const Parameters& dst = regs.dst_params; + const DMA::Parameters& src = regs.src_params; + const DMA::Parameters& dst = regs.dst_params; const u32 num_remap_components = regs.remap_const.num_dst_components_minus_one + 1; const u32 remap_components_size = regs.remap_const.component_size_minus_one + 1; diff --git a/src/video_core/engines/maxwell_dma.h b/src/video_core/engines/maxwell_dma.h index 0e594fa74..69e26cb32 100644 --- a/src/video_core/engines/maxwell_dma.h +++ b/src/video_core/engines/maxwell_dma.h @@ -24,6 +24,54 @@ namespace VideoCore { class RasterizerInterface; } +namespace Tegra { +namespace DMA { + +union Origin { + BitField<0, 16, u32> x; + BitField<16, 16, u32> y; +}; +static_assert(sizeof(Origin) == 4); + +struct ImageCopy { + u32 length_x{}; + u32 length_y{}; +}; + +union BlockSize { + BitField<0, 4, u32> width; + BitField<4, 4, u32> height; + BitField<8, 4, u32> depth; + BitField<12, 4, u32> gob_height; +}; +static_assert(sizeof(BlockSize) == 4); + +struct Parameters { + BlockSize block_size; + u32 width; + u32 height; + u32 depth; + u32 layer; + Origin origin; +}; +static_assert(sizeof(Parameters) == 24); + +struct ImageOperand { + u32 bytes_per_pixel; + Parameters params; + GPUVAddr address; +}; + +struct BufferOperand { + u32 pitch; + u32 width; + u32 height; + GPUVAddr address; +}; + +} // namespace DMA +} // namespace Tegra + namespace Tegra::Engines { class AccelerateDMAInterface { @@ -32,6 +80,12 @@ public: virtual bool BufferCopy(GPUVAddr src_address, GPUVAddr dest_address, u64 amount) = 0; virtual bool BufferClear(GPUVAddr src_address, u64 amount, u32 value) = 0; + + virtual bool ImageToBuffer(const DMA::ImageCopy& copy_info, const DMA::ImageOperand& src, + const DMA::BufferOperand& dst) = 0; + + virtual bool BufferToImage(const DMA::ImageCopy& copy_info, const DMA::BufferOperand& src, + const DMA::ImageOperand& dst) = 0; }; /** @@ -51,30 +105,6 @@ public: } }; - union BlockSize { - BitField<0, 4, u32> width; - BitField<4, 4, u32> height; - BitField<8, 4, u32> depth; - BitField<12, 4, u32> gob_height; - }; - static_assert(sizeof(BlockSize) == 4); - - union Origin { - BitField<0, 16, u32> x; - BitField<16, 16, u32> y; - }; - static_assert(sizeof(Origin) == 4); - - struct Parameters { - BlockSize block_size; - u32 width; - u32 height; - u32 depth; - u32 layer; - Origin origin; - }; - static_assert(sizeof(Parameters) == 24); - struct Semaphore { PackedGPUVAddr address; u32 payload; @@ -227,8 +257,6 @@ private: void CopyBlockLinearToBlockLinear(); - void FastCopyBlockLinearToPitch(); - void ReleaseSemaphore(); void ConsumeSinkImpl() override; @@ -261,17 +289,17 @@ private: u32 reserved05[0x3f]; PackedGPUVAddr offset_in; PackedGPUVAddr offset_out; - u32 pitch_in; - u32 pitch_out; + s32 pitch_in; + s32 pitch_out; u32 line_length_in; u32 line_count; u32 reserved06[0xb6]; u32 remap_consta_value; u32 remap_constb_value; RemapConst remap_const; - Parameters dst_params; + DMA::Parameters dst_params; u32 reserved07[0x1]; - Parameters src_params; + DMA::Parameters src_params; u32 reserved08[0x275]; u32 pm_trigger_end; u32 reserved09[0x3ba]; diff --git a/src/video_core/engines/sw_blitter/blitter.cpp b/src/video_core/engines/sw_blitter/blitter.cpp index 2f1ea4626..3c9f38559 100644 --- a/src/video_core/engines/sw_blitter/blitter.cpp +++ b/src/video_core/engines/sw_blitter/blitter.cpp @@ -193,7 +193,7 @@ bool SoftwareBlitEngine::Blit(Fermi2D::Surface& src, Fermi2D::Surface& dst, output_converter->ConvertFrom(impl->intermediate_dst, 
impl->dst_buffer); }; - // Do actuall Blit + // Do actual Blit impl->dst_buffer.resize(dst_copy_size); if (src.linear == Fermi2D::MemoryLayout::BlockLinear) { diff --git a/src/video_core/framebuffer_config.h b/src/video_core/framebuffer_config.h index d93f5a37f..5f3bffcab 100644 --- a/src/video_core/framebuffer_config.h +++ b/src/video_core/framebuffer_config.h @@ -5,8 +5,8 @@ #include "common/common_types.h" #include "common/math_util.h" -#include "core/hle/service/nvflinger/buffer_transform_flags.h" -#include "core/hle/service/nvflinger/pixel_format.h" +#include "core/hle/service/nvnflinger/buffer_transform_flags.h" +#include "core/hle/service/nvnflinger/pixel_format.h" namespace Tegra { diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp index 7024a19cf..2e7f9c5ed 100644 --- a/src/video_core/gpu.cpp +++ b/src/video_core/gpu.cpp @@ -197,7 +197,7 @@ struct GPU::Impl { constexpr u64 gpu_ticks_num = 384; constexpr u64 gpu_ticks_den = 625; - u64 nanoseconds = system.CoreTiming().GetGlobalTimeNs().count(); + u64 nanoseconds = system.CoreTiming().GetCPUTimeNs().count(); if (Settings::values.use_fast_gpu_time.GetValue()) { nanoseconds /= 256; } diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp index 7cc5647e9..f52f9e28f 100644 --- a/src/video_core/gpu_thread.cpp +++ b/src/video_core/gpu_thread.cpp @@ -25,7 +25,7 @@ static void RunThread(std::stop_token stop_token, Core::System& system, SCOPE_EXIT({ MicroProfileOnThreadExit(); }); Common::SetCurrentThreadName(name.c_str()); - Common::SetCurrentThreadPriority(Common::ThreadPriority::High); + Common::SetCurrentThreadPriority(Common::ThreadPriority::Critical); system.RegisterHostThread(); auto current_context = context.Acquire(); diff --git a/src/video_core/host_shaders/astc_decoder.comp b/src/video_core/host_shaders/astc_decoder.comp index d608678a3..bf2693559 100644 --- a/src/video_core/host_shaders/astc_decoder.comp +++ b/src/video_core/host_shaders/astc_decoder.comp @@ -125,7 +125,7 @@ uvec4 local_buff; uvec4 color_endpoint_data; int color_bitsread = 0; -// Four values, two endpoints, four maximum paritions +// Four values, two endpoints, four maximum partitions uint color_values[32]; int colvals_index = 0; diff --git a/src/video_core/host_shaders/opengl_smaa.glsl b/src/video_core/host_shaders/opengl_smaa.glsl index 3cbe87bbf..419f89bca 100644 --- a/src/video_core/host_shaders/opengl_smaa.glsl +++ b/src/video_core/host_shaders/opengl_smaa.glsl @@ -97,7 +97,7 @@ * half-rate linear filtering on GCN. * * If SMAA is applied to 64-bit color buffers, switching to point filtering - * when accesing them will increase the performance. Search for + * when accessing them will increase the performance. Search for * 'SMAASamplePoint' to see which textures may benefit from point * filtering, and where (which is basically the color input in the edge * detection and resolve passes). diff --git a/src/video_core/memory_manager.h b/src/video_core/memory_manager.h index cf56392ef..51ae2de68 100644 --- a/src/video_core/memory_manager.h +++ b/src/video_core/memory_manager.h @@ -103,8 +103,8 @@ public: /** * Returns a vector with all the subranges of cpu addresses mapped beneath. - * if the region is continous, a single pair will be returned. If it's unmapped, an empty vector - * will be returned; + * if the region is continuous, a single pair will be returned. 
If it's unmapped, an empty + * vector will be returned; */ std::vector<std::pair<GPUVAddr, std::size_t>> GetSubmappedRange(GPUVAddr gpu_addr, std::size_t size) const; diff --git a/src/video_core/query_cache.h b/src/video_core/query_cache.h index 00ce53e3e..8906ba6d8 100644 --- a/src/video_core/query_cache.h +++ b/src/video_core/query_cache.h @@ -341,7 +341,7 @@ public: /// Flushes the query to guest memory. virtual void Flush() { - // When counter is nullptr it means that it's just been reseted. We are supposed to write a + // When counter is nullptr it means that it's just been reset. We are supposed to write a // zero in these cases. const u64 value = counter ? counter->Query() : 0; std::memcpy(host_ptr, &value, sizeof(u64)); diff --git a/src/video_core/renderer_null/null_rasterizer.h b/src/video_core/renderer_null/null_rasterizer.h index 51f896e43..0c59e6a1f 100644 --- a/src/video_core/renderer_null/null_rasterizer.h +++ b/src/video_core/renderer_null/null_rasterizer.h @@ -22,6 +22,14 @@ public: explicit AccelerateDMA(); bool BufferCopy(GPUVAddr start_address, GPUVAddr end_address, u64 amount) override; bool BufferClear(GPUVAddr src_address, u64 amount, u32 value) override; + bool ImageToBuffer(const Tegra::DMA::ImageCopy& copy_info, const Tegra::DMA::ImageOperand& src, + const Tegra::DMA::BufferOperand& dst) override { + return false; + } + bool BufferToImage(const Tegra::DMA::ImageCopy& copy_info, const Tegra::DMA::BufferOperand& src, + const Tegra::DMA::ImageOperand& dst) override { + return false; + } }; class RasterizerNull final : public VideoCore::RasterizerAccelerated, diff --git a/src/video_core/renderer_opengl/blit_image.cpp b/src/video_core/renderer_opengl/blit_image.cpp index 9a560a73b..3b03e8d5a 100644 --- a/src/video_core/renderer_opengl/blit_image.cpp +++ b/src/video_core/renderer_opengl/blit_image.cpp @@ -22,7 +22,7 @@ BlitImageHelper::~BlitImageHelper() = default; void BlitImageHelper::BlitColor(GLuint dst_framebuffer, GLuint src_image_view, GLuint src_sampler, const Region2D& dst_region, const Region2D& src_region, const Extent3D& src_size) { - glEnable(GL_CULL_FACE); + glDisable(GL_CULL_FACE); glDisable(GL_COLOR_LOGIC_OP); glDisable(GL_DEPTH_TEST); glDisable(GL_STENCIL_TEST); @@ -31,7 +31,6 @@ void BlitImageHelper::BlitColor(GLuint dst_framebuffer, GLuint src_image_view, G glDisable(GL_ALPHA_TEST); glDisablei(GL_BLEND, 0); glPolygonMode(GL_FRONT_AND_BACK, GL_FILL); - glCullFace(GL_BACK); glFrontFace(GL_CW); glColorMaski(0, GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE); glDepthRangeIndexed(0, 0.0, 0.0); diff --git a/src/video_core/renderer_opengl/gl_fence_manager.cpp b/src/video_core/renderer_opengl/gl_fence_manager.cpp index 91463f854..5326172af 100644 --- a/src/video_core/renderer_opengl/gl_fence_manager.cpp +++ b/src/video_core/renderer_opengl/gl_fence_manager.cpp @@ -27,9 +27,7 @@ bool GLInnerFence::IsSignaled() const { return true; } ASSERT(sync_object.handle != 0); - GLint sync_status; - glGetSynciv(sync_object.handle, GL_SYNC_STATUS, 1, nullptr, &sync_status); - return sync_status == GL_SIGNALED; + return sync_object.IsSignaled(); } void GLInnerFence::Wait() { diff --git a/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp b/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp index 29491e762..89000d6e0 100644 --- a/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp +++ b/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp @@ -621,10 +621,7 @@ bool GraphicsPipeline::IsBuilt() noexcept { if (built_fence.handle == 0) { return false; } - // Timeout of zero 
means this is non-blocking - const auto sync_status = glClientWaitSync(built_fence.handle, 0, 0); - ASSERT(sync_status != GL_WAIT_FAILED); - is_built = sync_status != GL_TIMEOUT_EXPIRED; + is_built = built_fence.IsSignaled(); return is_built; } diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index 7bced675c..90e35e307 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -63,7 +63,7 @@ RasterizerOpenGL::RasterizerOpenGL(Core::Frontend::EmuWindow& emu_window_, Tegra buffer_cache(*this, cpu_memory_, buffer_cache_runtime), shader_cache(*this, emu_window_, device, texture_cache, buffer_cache, program_manager, state_tracker, gpu.ShaderNotify()), - query_cache(*this), accelerate_dma(buffer_cache), + query_cache(*this), accelerate_dma(buffer_cache, texture_cache), fence_manager(*this, gpu, texture_cache, buffer_cache, query_cache), blit_image(program_manager_) {} @@ -357,6 +357,7 @@ void RasterizerOpenGL::DrawTexture() { .y = static_cast<s32>(draw_texture_state.src_y1)}}; blit_image.BlitColor(texture_cache.GetFramebuffer()->Handle(), texture.DefaultHandle(), sampler->Handle(), dst_region, src_region, texture.size); + state_tracker.InvalidateState(); } ++num_queued_commands; @@ -576,7 +577,7 @@ bool RasterizerOpenGL::AccelerateConditionalRendering() { // Reimplement Host conditional rendering. return false; } - // Medium / Low Hack: stub any checks on queries writen into the buffer cache. + // Medium / Low Hack: stub any checks on queries written into the buffer cache. const GPUVAddr condition_address{maxwell3d->regs.render_enable.Address()}; Maxwell::ReportSemaphore::Compare cmp; if (gpu_memory->IsMemoryDirty(condition_address, sizeof(cmp), @@ -1262,7 +1263,8 @@ void RasterizerOpenGL::ReleaseChannel(s32 channel_id) { query_cache.EraseChannel(channel_id); } -AccelerateDMA::AccelerateDMA(BufferCache& buffer_cache_) : buffer_cache{buffer_cache_} {} +AccelerateDMA::AccelerateDMA(BufferCache& buffer_cache_, TextureCache& texture_cache_) + : buffer_cache{buffer_cache_}, texture_cache{texture_cache_} {} bool AccelerateDMA::BufferCopy(GPUVAddr src_address, GPUVAddr dest_address, u64 amount) { std::scoped_lock lock{buffer_cache.mutex}; @@ -1274,4 +1276,44 @@ bool AccelerateDMA::BufferClear(GPUVAddr src_address, u64 amount, u32 value) { return buffer_cache.DMAClear(src_address, amount, value); } +template <bool IS_IMAGE_UPLOAD> +bool AccelerateDMA::DmaBufferImageCopy(const Tegra::DMA::ImageCopy& copy_info, + const Tegra::DMA::BufferOperand& buffer_operand, + const Tegra::DMA::ImageOperand& image_operand) { + std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex}; + const auto image_id = texture_cache.DmaImageId(image_operand); + if (image_id == VideoCommon::NULL_IMAGE_ID) { + return false; + } + const u32 buffer_size = static_cast<u32>(buffer_operand.pitch * buffer_operand.height); + static constexpr auto sync_info = VideoCommon::ObtainBufferSynchronize::FullSynchronize; + const auto post_op = IS_IMAGE_UPLOAD ? 
VideoCommon::ObtainBufferOperation::DoNothing + : VideoCommon::ObtainBufferOperation::MarkAsWritten; + const auto [buffer, offset] = + buffer_cache.ObtainBuffer(buffer_operand.address, buffer_size, sync_info, post_op); + + const auto [image, copy] = texture_cache.DmaBufferImageCopy( + copy_info, buffer_operand, image_operand, image_id, IS_IMAGE_UPLOAD); + const std::span copy_span{&copy, 1}; + + if constexpr (IS_IMAGE_UPLOAD) { + image->UploadMemory(buffer->Handle(), offset, copy_span); + } else { + image->DownloadMemory(buffer->Handle(), offset, copy_span); + } + return true; +} + +bool AccelerateDMA::ImageToBuffer(const Tegra::DMA::ImageCopy& copy_info, + const Tegra::DMA::ImageOperand& image_operand, + const Tegra::DMA::BufferOperand& buffer_operand) { + return DmaBufferImageCopy<false>(copy_info, buffer_operand, image_operand); +} + +bool AccelerateDMA::BufferToImage(const Tegra::DMA::ImageCopy& copy_info, + const Tegra::DMA::BufferOperand& buffer_operand, + const Tegra::DMA::ImageOperand& image_operand) { + return DmaBufferImageCopy<true>(copy_info, buffer_operand, image_operand); +} + } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index 0c45832ae..ad6978bd0 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -50,14 +50,26 @@ static_assert(sizeof(BindlessSSBO) * CHAR_BIT == 128); class AccelerateDMA : public Tegra::Engines::AccelerateDMAInterface { public: - explicit AccelerateDMA(BufferCache& buffer_cache); + explicit AccelerateDMA(BufferCache& buffer_cache, TextureCache& texture_cache); bool BufferCopy(GPUVAddr src_address, GPUVAddr dest_address, u64 amount) override; bool BufferClear(GPUVAddr src_address, u64 amount, u32 value) override; + bool ImageToBuffer(const Tegra::DMA::ImageCopy& copy_info, const Tegra::DMA::ImageOperand& src, + const Tegra::DMA::BufferOperand& dst) override; + + bool BufferToImage(const Tegra::DMA::ImageCopy& copy_info, const Tegra::DMA::BufferOperand& src, + const Tegra::DMA::ImageOperand& dst) override; + private: + template <bool IS_IMAGE_UPLOAD> + bool DmaBufferImageCopy(const Tegra::DMA::ImageCopy& copy_info, + const Tegra::DMA::BufferOperand& src, + const Tegra::DMA::ImageOperand& dst); + BufferCache& buffer_cache; + TextureCache& texture_cache; }; class RasterizerOpenGL : public VideoCore::RasterizerAccelerated, @@ -150,7 +162,7 @@ private: /// Syncs the cull mode to match the guest state void SyncCullMode(); - /// Syncs the primitve restart to match the guest state + /// Syncs the primitive restart to match the guest state void SyncPrimitiveRestart(); /// Syncs the depth test state to match the guest state @@ -234,7 +246,7 @@ private: std::array<GLuint, MAX_TEXTURES> texture_handles{}; std::array<GLuint, MAX_IMAGES> image_handles{}; - /// Number of commands queued to the OpenGL driver. Resetted on flush. + /// Number of commands queued to the OpenGL driver. Reset on flush. 
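Both fence checks converted above (GLInnerFence::IsSignaled and GraphicsPipeline::IsBuilt) now share a single idiom, defined as OGLSync::IsSignaled in the gl_resource_manager hunk below. In isolation the poll looks like this; the standalone helper is hypothetical, and it tests the two GL success codes rather than the != GL_TIMEOUT_EXPIRED form the diff uses, which is equivalent once GL_WAIT_FAILED is asserted away.

bool IsFencePassed(GLsync sync) {
    // Zero flags and a zero timeout make this a non-blocking status poll.
    const GLenum status = glClientWaitSync(sync, 0, 0);
    // GL_ALREADY_SIGNALED: signaled before the call. GL_CONDITION_SATISFIED:
    // signaled during the zero-length wait. Anything else is not-ready
    // (GL_TIMEOUT_EXPIRED) or an error (GL_WAIT_FAILED).
    return status == GL_ALREADY_SIGNALED || status == GL_CONDITION_SATISFIED;
}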
size_t num_queued_commands = 0; bool has_written_global_memory = false; diff --git a/src/video_core/renderer_opengl/gl_resource_manager.cpp b/src/video_core/renderer_opengl/gl_resource_manager.cpp index 3a664fdec..eae8fd110 100644 --- a/src/video_core/renderer_opengl/gl_resource_manager.cpp +++ b/src/video_core/renderer_opengl/gl_resource_manager.cpp @@ -3,6 +3,7 @@ #include <string_view> #include <glad/glad.h> +#include "common/assert.h" #include "common/microprofile.h" #include "video_core/renderer_opengl/gl_resource_manager.h" #include "video_core/renderer_opengl/gl_shader_util.h" @@ -158,6 +159,15 @@ void OGLSync::Release() { handle = 0; } +bool OGLSync::IsSignaled() const noexcept { + // At least on Nvidia, glClientWaitSync with a timeout of 0 + // is faster than glGetSynciv of GL_SYNC_STATUS. + // Timeout of 0 means this check is non-blocking. + const auto sync_status = glClientWaitSync(handle, 0, 0); + ASSERT(sync_status != GL_WAIT_FAILED); + return sync_status != GL_TIMEOUT_EXPIRED; +} + void OGLFramebuffer::Create() { if (handle != 0) return; diff --git a/src/video_core/renderer_opengl/gl_resource_manager.h b/src/video_core/renderer_opengl/gl_resource_manager.h index bc05ba4bd..77362acd2 100644 --- a/src/video_core/renderer_opengl/gl_resource_manager.h +++ b/src/video_core/renderer_opengl/gl_resource_manager.h @@ -263,6 +263,9 @@ public: /// Deletes the internal OpenGL resource void Release(); + /// Checks if the sync has been signaled + bool IsSignaled() const noexcept; + GLsync handle = 0; }; diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index b047e7b3d..0b9c4a904 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -112,13 +112,17 @@ GLenum ImageTarget(Shader::TextureType type, int num_samples = 1) { return GL_NONE; } -GLenum TextureMode(PixelFormat format, bool is_first) { +GLenum TextureMode(PixelFormat format, std::array<SwizzleSource, 4> swizzle) { + bool any_r = + std::ranges::any_of(swizzle, [](SwizzleSource s) { return s == SwizzleSource::R; }); switch (format) { case PixelFormat::D24_UNORM_S8_UINT: case PixelFormat::D32_FLOAT_S8_UINT: - return is_first ? GL_DEPTH_COMPONENT : GL_STENCIL_INDEX; + // R = depth, G = stencil + return any_r ? GL_DEPTH_COMPONENT : GL_STENCIL_INDEX; case PixelFormat::S8_UINT_D24_UNORM: - return is_first ? GL_STENCIL_INDEX : GL_DEPTH_COMPONENT; + // R = stencil, G = depth + return any_r ? 
GL_STENCIL_INDEX : GL_DEPTH_COMPONENT; default: ASSERT(false); return GL_DEPTH_COMPONENT; @@ -208,8 +212,7 @@ void ApplySwizzle(GLuint handle, PixelFormat format, std::array<SwizzleSource, 4 case PixelFormat::D32_FLOAT_S8_UINT: case PixelFormat::S8_UINT_D24_UNORM: UNIMPLEMENTED_IF(swizzle[0] != SwizzleSource::R && swizzle[0] != SwizzleSource::G); - glTextureParameteri(handle, GL_DEPTH_STENCIL_TEXTURE_MODE, - TextureMode(format, swizzle[0] == SwizzleSource::R)); + glTextureParameteri(handle, GL_DEPTH_STENCIL_TEXTURE_MODE, TextureMode(format, swizzle)); std::ranges::transform(swizzle, swizzle.begin(), ConvertGreenRed); break; case PixelFormat::A5B5G5R1_UNORM: { @@ -714,9 +717,7 @@ std::optional<size_t> TextureCacheRuntime::StagingBuffers::FindBuffer(size_t req continue; } if (syncs[index].handle != 0) { - GLint status; - glGetSynciv(syncs[index].handle, GL_SYNC_STATUS, 1, nullptr, &status); - if (status != GL_SIGNALED) { + if (!syncs[index].IsSignaled()) { continue; } syncs[index].Release(); @@ -762,14 +763,14 @@ Image::Image(const VideoCommon::NullImageParams& params) : VideoCommon::ImageBas Image::~Image() = default; -void Image::UploadMemory(const ImageBufferMap& map, +void Image::UploadMemory(GLuint buffer_handle, size_t buffer_offset, std::span<const VideoCommon::BufferImageCopy> copies) { const bool is_rescaled = True(flags & ImageFlagBits::Rescaled); if (is_rescaled) { ScaleDown(true); } - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, map.buffer); - glFlushMappedBufferRange(GL_PIXEL_UNPACK_BUFFER, map.offset, unswizzled_size_bytes); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, buffer_handle); + glFlushMappedBufferRange(GL_PIXEL_UNPACK_BUFFER, buffer_offset, unswizzled_size_bytes); glPixelStorei(GL_UNPACK_ALIGNMENT, 1); @@ -788,21 +789,26 @@ void Image::UploadMemory(const ImageBufferMap& map, current_image_height = copy.buffer_image_height; glPixelStorei(GL_UNPACK_IMAGE_HEIGHT, current_image_height); } - CopyBufferToImage(copy, map.offset); + CopyBufferToImage(copy, buffer_offset); } if (is_rescaled) { ScaleUp(); } } -void Image::DownloadMemory(ImageBufferMap& map, +void Image::UploadMemory(const ImageBufferMap& map, + std::span<const VideoCommon::BufferImageCopy> copies) { + UploadMemory(map.buffer, map.offset, copies); +} + +void Image::DownloadMemory(GLuint buffer_handle, size_t buffer_offset, std::span<const VideoCommon::BufferImageCopy> copies) { const bool is_rescaled = True(flags & ImageFlagBits::Rescaled); if (is_rescaled) { ScaleDown(); } glMemoryBarrier(GL_PIXEL_BUFFER_BARRIER_BIT); // TODO: Move this to its own API - glBindBuffer(GL_PIXEL_PACK_BUFFER, map.buffer); + glBindBuffer(GL_PIXEL_PACK_BUFFER, buffer_handle); glPixelStorei(GL_PACK_ALIGNMENT, 1); u32 current_row_length = std::numeric_limits<u32>::max(); @@ -820,13 +826,18 @@ void Image::DownloadMemory(ImageBufferMap& map, current_image_height = copy.buffer_image_height; glPixelStorei(GL_PACK_IMAGE_HEIGHT, current_image_height); } - CopyImageToBuffer(copy, map.offset); + CopyImageToBuffer(copy, buffer_offset); } if (is_rescaled) { ScaleUp(true); } } +void Image::DownloadMemory(ImageBufferMap& map, + std::span<const VideoCommon::BufferImageCopy> copies) { + DownloadMemory(map.buffer, map.offset, copies); +} + GLuint Image::StorageHandle() noexcept { switch (info.format) { case PixelFormat::A8B8G8R8_SRGB: diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h index e30875496..911e4607a 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.h +++ 
b/src/video_core/renderer_opengl/gl_texture_cache.h @@ -206,9 +206,15 @@ public: Image(Image&&) = default; Image& operator=(Image&&) = default; + void UploadMemory(GLuint buffer_handle, size_t buffer_offset, + std::span<const VideoCommon::BufferImageCopy> copies); + void UploadMemory(const ImageBufferMap& map, std::span<const VideoCommon::BufferImageCopy> copies); + void DownloadMemory(GLuint buffer_handle, size_t buffer_offset, + std::span<const VideoCommon::BufferImageCopy> copies); + void DownloadMemory(ImageBufferMap& map, std::span<const VideoCommon::BufferImageCopy> copies); GLuint StorageHandle() noexcept; diff --git a/src/video_core/renderer_vulkan/fixed_pipeline_state.cpp b/src/video_core/renderer_vulkan/fixed_pipeline_state.cpp index f8398b511..e7df32d84 100644 --- a/src/video_core/renderer_vulkan/fixed_pipeline_state.cpp +++ b/src/video_core/renderer_vulkan/fixed_pipeline_state.cpp @@ -271,7 +271,7 @@ bool FixedPipelineState::operator==(const FixedPipelineState& rhs) const noexcep u32 FixedPipelineState::PackComparisonOp(Maxwell::ComparisonOp op) noexcept { // OpenGL enums go from 0x200 to 0x207 and the others from 1 to 8 - // If we substract 0x200 to OpenGL enums and 1 to the others we get a 0-7 range. + // If we subtract 0x200 to OpenGL enums and 1 to the others we get a 0-7 range. // Perfect for a hash. const u32 value = static_cast<u32>(op); return value - (value >= 0x200 ? 0x200 : 1); @@ -322,8 +322,8 @@ Maxwell::StencilOp::Op FixedPipelineState::UnpackStencilOp(u32 packed) noexcept } u32 FixedPipelineState::PackCullFace(Maxwell::CullFace cull) noexcept { - // FrontAndBack is 0x408, by substracting 0x406 in it we get 2. - // Individual cull faces are in 0x404 and 0x405, substracting 0x404 we get 0 and 1. + // FrontAndBack is 0x408, by subtracting 0x406 in it we get 2. + // Individual cull faces are in 0x404 and 0x405, subtracting 0x404 we get 0 and 1. const u32 value = static_cast<u32>(cull); return value - (value == 0x408 ? 
0x406 : 0x404); } diff --git a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp index ca52e2389..5dce51be8 100644 --- a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp +++ b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp @@ -166,7 +166,7 @@ struct FormatTuple { {VK_FORMAT_R16G16_UINT, Attachable | Storage}, // R16G16_UINT {VK_FORMAT_R16G16_SINT, Attachable | Storage}, // R16G16_SINT {VK_FORMAT_R16G16_SNORM, Attachable | Storage}, // R16G16_SNORM - {VK_FORMAT_UNDEFINED}, // R32G32B32_FLOAT + {VK_FORMAT_R32G32B32_SFLOAT}, // R32G32B32_FLOAT {VK_FORMAT_A8B8G8R8_SRGB_PACK32, Attachable}, // A8B8G8R8_SRGB {VK_FORMAT_R8G8_UNORM, Attachable | Storage}, // R8G8_UNORM {VK_FORMAT_R8G8_SNORM, Attachable | Storage}, // R8G8_SNORM @@ -234,11 +234,6 @@ FormatInfo SurfaceFormat(const Device& device, FormatType format_type, bool with PixelFormat pixel_format) { ASSERT(static_cast<size_t>(pixel_format) < std::size(tex_format_tuples)); FormatTuple tuple = tex_format_tuples[static_cast<size_t>(pixel_format)]; - if (tuple.format == VK_FORMAT_UNDEFINED) { - UNIMPLEMENTED_MSG("Unimplemented texture format with pixel format={}", pixel_format); - return FormatInfo{VK_FORMAT_A8B8G8R8_UNORM_PACK32, true, true}; - } - // Use A8B8G8R8_UNORM on hardware that doesn't support ASTC natively if (!device.IsOptimalAstcSupported() && VideoCore::Surface::IsPixelFormatASTC(pixel_format)) { const bool is_srgb = with_srgb && VideoCore::Surface::IsPixelFormatSRGB(pixel_format); diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp index b0153a502..9cbcb3c8f 100644 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp @@ -238,7 +238,7 @@ private: return indices; } - void MakeAndUpdateIndices(u8* staging_data, size_t quad_size, u32 quad, u32 first) { + void MakeAndUpdateIndices(u8* staging_data, size_t quad_size, u32 quad, u32 first) override { switch (index_type) { case VK_INDEX_TYPE_UINT8_EXT: std::memcpy(staging_data, MakeIndices<u8>(quad, first).data(), quad_size); @@ -278,7 +278,7 @@ private: return indices; } - void MakeAndUpdateIndices(u8* staging_data, size_t quad_size, u32 quad, u32 first) { + void MakeAndUpdateIndices(u8* staging_data, size_t quad_size, u32 quad, u32 first) override { switch (index_type) { case VK_INDEX_TYPE_UINT8_EXT: std::memcpy(staging_data, MakeIndices<u8>(quad, first).data(), quad_size); diff --git a/src/video_core/renderer_vulkan/vk_command_pool.cpp b/src/video_core/renderer_vulkan/vk_command_pool.cpp index 2f09de1c1..d0dbf7ca5 100644 --- a/src/video_core/renderer_vulkan/vk_command_pool.cpp +++ b/src/video_core/renderer_vulkan/vk_command_pool.cpp @@ -22,8 +22,8 @@ CommandPool::CommandPool(MasterSemaphore& master_semaphore_, const Device& devic CommandPool::~CommandPool() = default; void CommandPool::Allocate(size_t begin, size_t end) { - // Command buffers are going to be commited, recorded, executed every single usage cycle. - // They are also going to be reseted when commited. + // Command buffers are going to be committed, recorded, executed every single usage cycle. + // They are also going to be reset when committed. 
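Returning to the PackCullFace arithmetic documented a few hunks up: the magic numbers are the standard GL enum values, so the packing can be verified directly. Pack here is a hypothetical standalone copy of the function body, kept only to show the worked values.

constexpr u32 Pack(u32 value) {
    // GL_FRONT = 0x404, GL_BACK = 0x405, GL_FRONT_AND_BACK = 0x408.
    return value - (value == 0x408 ? 0x406 : 0x404);
}
static_assert(Pack(0x404) == 0); // GL_FRONT
static_assert(Pack(0x405) == 1); // GL_BACK
static_assert(Pack(0x408) == 2); // GL_FRONT_AND_BACK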
Pool& pool = pools.emplace_back(); pool.handle = device.GetLogical().CreateCommandPool({ .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO, diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index 719edbcfb..673ab478e 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -172,7 +172,7 @@ RasterizerVulkan::RasterizerVulkan(Core::Frontend::EmuWindow& emu_window_, Tegra buffer_cache(*this, cpu_memory_, buffer_cache_runtime), pipeline_cache(*this, device, scheduler, descriptor_pool, update_descriptor_queue, render_pass_cache, buffer_cache, texture_cache, gpu.ShaderNotify()), - query_cache{*this, device, scheduler}, accelerate_dma{buffer_cache}, + query_cache{*this, device, scheduler}, accelerate_dma(buffer_cache, texture_cache, scheduler), fence_manager(*this, gpu, texture_cache, buffer_cache, query_cache, device, scheduler), wfi_event(device.GetLogical().CreateEvent()) { scheduler.SetQueryCache(query_cache); @@ -671,7 +671,7 @@ bool RasterizerVulkan::AccelerateConditionalRendering() { // TODO(Blinkhawk): Reimplement Host conditional rendering. return false; } - // Medium / Low Hack: stub any checks on queries writen into the buffer cache. + // Medium / Low Hack: stub any checks on queries written into the buffer cache. const GPUVAddr condition_address{maxwell3d->regs.render_enable.Address()}; Maxwell::ReportSemaphore::Compare cmp; if (gpu_memory->IsMemoryDirty(condition_address, sizeof(cmp), @@ -756,7 +756,9 @@ void RasterizerVulkan::FlushWork() { draw_counter = 0; } -AccelerateDMA::AccelerateDMA(BufferCache& buffer_cache_) : buffer_cache{buffer_cache_} {} +AccelerateDMA::AccelerateDMA(BufferCache& buffer_cache_, TextureCache& texture_cache_, + Scheduler& scheduler_) + : buffer_cache{buffer_cache_}, texture_cache{texture_cache_}, scheduler{scheduler_} {} bool AccelerateDMA::BufferClear(GPUVAddr src_address, u64 amount, u32 value) { std::scoped_lock lock{buffer_cache.mutex}; @@ -768,6 +770,46 @@ bool AccelerateDMA::BufferCopy(GPUVAddr src_address, GPUVAddr dest_address, u64 return buffer_cache.DMACopy(src_address, dest_address, amount); } +template <bool IS_IMAGE_UPLOAD> +bool AccelerateDMA::DmaBufferImageCopy(const Tegra::DMA::ImageCopy& copy_info, + const Tegra::DMA::BufferOperand& buffer_operand, + const Tegra::DMA::ImageOperand& image_operand) { + std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex}; + const auto image_id = texture_cache.DmaImageId(image_operand); + if (image_id == VideoCommon::NULL_IMAGE_ID) { + return false; + } + const u32 buffer_size = static_cast<u32>(buffer_operand.pitch * buffer_operand.height); + static constexpr auto sync_info = VideoCommon::ObtainBufferSynchronize::FullSynchronize; + const auto post_op = IS_IMAGE_UPLOAD ? 
VideoCommon::ObtainBufferOperation::DoNothing + : VideoCommon::ObtainBufferOperation::MarkAsWritten; + const auto [buffer, offset] = + buffer_cache.ObtainBuffer(buffer_operand.address, buffer_size, sync_info, post_op); + + const auto [image, copy] = texture_cache.DmaBufferImageCopy( + copy_info, buffer_operand, image_operand, image_id, IS_IMAGE_UPLOAD); + const std::span copy_span{&copy, 1}; + + if constexpr (IS_IMAGE_UPLOAD) { + image->UploadMemory(buffer->Handle(), offset, copy_span); + } else { + image->DownloadMemory(buffer->Handle(), offset, copy_span); + } + return true; +} + +bool AccelerateDMA::ImageToBuffer(const Tegra::DMA::ImageCopy& copy_info, + const Tegra::DMA::ImageOperand& image_operand, + const Tegra::DMA::BufferOperand& buffer_operand) { + return DmaBufferImageCopy<false>(copy_info, buffer_operand, image_operand); +} + +bool AccelerateDMA::BufferToImage(const Tegra::DMA::ImageCopy& copy_info, + const Tegra::DMA::BufferOperand& buffer_operand, + const Tegra::DMA::ImageOperand& image_operand) { + return DmaBufferImageCopy<true>(copy_info, buffer_operand, image_operand); +} + void RasterizerVulkan::UpdateDynamicStates() { auto& regs = maxwell3d->regs; UpdateViewportsState(regs); @@ -1064,7 +1106,7 @@ void RasterizerVulkan::UpdateDepthBoundsTestEnable(Tegra::Engines::Maxwell3D::Re LOG_WARNING(Render_Vulkan, "Depth bounds is enabled but not supported"); enabled = false; } - scheduler.Record([enable = regs.depth_bounds_enable](vk::CommandBuffer cmdbuf) { + scheduler.Record([enable = enabled](vk::CommandBuffer cmdbuf) { cmdbuf.SetDepthBoundsTestEnableEXT(enable); }); } diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h index a0508b57c..1659fbc13 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.h +++ b/src/video_core/renderer_vulkan/vk_rasterizer.h @@ -45,14 +45,28 @@ class StateTracker; class AccelerateDMA : public Tegra::Engines::AccelerateDMAInterface { public: - explicit AccelerateDMA(BufferCache& buffer_cache); + explicit AccelerateDMA(BufferCache& buffer_cache, TextureCache& texture_cache, + Scheduler& scheduler); bool BufferCopy(GPUVAddr start_address, GPUVAddr end_address, u64 amount) override; bool BufferClear(GPUVAddr src_address, u64 amount, u32 value) override; + bool ImageToBuffer(const Tegra::DMA::ImageCopy& copy_info, const Tegra::DMA::ImageOperand& src, + const Tegra::DMA::BufferOperand& dst) override; + + bool BufferToImage(const Tegra::DMA::ImageCopy& copy_info, const Tegra::DMA::BufferOperand& src, + const Tegra::DMA::ImageOperand& dst) override; + private: + template <bool IS_IMAGE_UPLOAD> + bool DmaBufferImageCopy(const Tegra::DMA::ImageCopy& copy_info, + const Tegra::DMA::BufferOperand& src, + const Tegra::DMA::ImageOperand& dst); + BufferCache& buffer_cache; + TextureCache& texture_cache; + Scheduler& scheduler; }; class RasterizerVulkan final : public VideoCore::RasterizerAccelerated, diff --git a/src/video_core/renderer_vulkan/vk_resource_pool.cpp b/src/video_core/renderer_vulkan/vk_resource_pool.cpp index 6c8ac22f4..6572f82ba 100644 --- a/src/video_core/renderer_vulkan/vk_resource_pool.cpp +++ b/src/video_core/renderer_vulkan/vk_resource_pool.cpp @@ -37,7 +37,7 @@ size_t ResourcePool::CommitResource() { found = free_resource; } } - // Free iterator is hinted to the resource after the one that's been commited. + // Free iterator is hinted to the resource after the one that's been committed. 
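The hint that the corrected comment above describes is a wrap-around scan: searching starts at the slot after the last commit, and on success the hint moves past the slot just handed out. A standalone sketch of the pattern, with a hypothetical helper rather than the actual ResourcePool code:

#include <cstddef>
#include <optional>

template <typename Pred>
std::optional<std::size_t> HintedScan(std::size_t count, std::size_t& hint, Pred&& is_free) {
    for (std::size_t i = 0; i < count; ++i) {
        const std::size_t index = (hint + i) % count; // wrap around at most once
        if (is_free(index)) {
            hint = (index + 1) % count; // next search starts after this slot
            return index;
        }
    }
    return std::nullopt; // no free slot; the diff grows the pool in ManageOverflow
}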
hint_iterator = (*found + 1) % ticks.size(); return *found; } @@ -46,7 +46,7 @@ size_t ResourcePool::ManageOverflow() { const size_t old_capacity = ticks.size(); Grow(); - // The last entry is guaranted to be free, since it's the first element of the freshly + // The last entry is guaranteed to be free, since it's the first element of the freshly // allocated resources. return old_capacity; } diff --git a/src/video_core/renderer_vulkan/vk_scheduler.cpp b/src/video_core/renderer_vulkan/vk_scheduler.cpp index e03685af1..c636a1625 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.cpp +++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp @@ -47,14 +47,15 @@ Scheduler::Scheduler(const Device& device_, StateTracker& state_tracker_) Scheduler::~Scheduler() = default; void Scheduler::Flush(VkSemaphore signal_semaphore, VkSemaphore wait_semaphore) { + // When flushing, we only send data to the worker thread; no waiting is necessary. SubmitExecution(signal_semaphore, wait_semaphore); AllocateNewContext(); } void Scheduler::Finish(VkSemaphore signal_semaphore, VkSemaphore wait_semaphore) { + // When finishing, we need to wait for the submission to have executed on the device. const u64 presubmit_tick = CurrentTick(); SubmitExecution(signal_semaphore, wait_semaphore); - WaitWorker(); Wait(presubmit_tick); AllocateNewContext(); } @@ -63,8 +64,13 @@ void Scheduler::WaitWorker() { MICROPROFILE_SCOPE(Vulkan_WaitForWorker); DispatchWork(); - std::unique_lock lock{work_mutex}; - wait_cv.wait(lock, [this] { return work_queue.empty(); }); + // Ensure the queue is drained. + std::unique_lock ql{queue_mutex}; + event_cv.wait(ql, [this] { return work_queue.empty(); }); + + // Now wait for execution to finish. + // This needs to be done in the same order as WorkerThread. + std::unique_lock el{execution_mutex}; } void Scheduler::DispatchWork() { @@ -72,10 +78,10 @@ void Scheduler::DispatchWork() { return; } { - std::scoped_lock lock{work_mutex}; + std::scoped_lock ql{queue_mutex}; work_queue.push(std::move(chunk)); } - work_cv.notify_one(); + event_cv.notify_all(); AcquireNewChunk(); } @@ -137,30 +143,55 @@ bool Scheduler::UpdateRescaling(bool is_rescaling) { void Scheduler::WorkerThread(std::stop_token stop_token) { Common::SetCurrentThreadName("VulkanWorker"); - do { + + const auto TryPopQueue{[this](auto& work) -> bool { + if (work_queue.empty()) { + return false; + } + + work = std::move(work_queue.front()); + work_queue.pop(); + event_cv.notify_all(); + return true; + }}; + + while (!stop_token.stop_requested()) { std::unique_ptr<CommandChunk> work; - bool has_submit{false}; + { - std::unique_lock lock{work_mutex}; - if (work_queue.empty()) { - wait_cv.notify_all(); - } - Common::CondvarWait(work_cv, lock, stop_token, [&] { return !work_queue.empty(); }); + std::unique_lock lk{queue_mutex}; + + // Wait for work. + Common::CondvarWait(event_cv, lk, stop_token, [&] { return TryPopQueue(work); }); + + // If we've been asked to stop, we're done. if (stop_token.stop_requested()) { - continue; + return; } - work = std::move(work_queue.front()); - work_queue.pop(); - has_submit = work->HasSubmit(); + // Exchange lock ownership so that we take the execution lock before + // the queue lock goes out of scope. This allows us to force execution + // to complete in the next step. + std::exchange(lk, std::unique_lock{execution_mutex}); + + // Perform the work, tracking whether the chunk was a submission + // before executing. 
+ const bool has_submit = work->HasSubmit(); work->ExecuteAll(current_cmdbuf); + + // If the chunk was a submission, reallocate the command buffer. + if (has_submit) { + AllocateWorkerCommandBuffer(); + } } - if (has_submit) { - AllocateWorkerCommandBuffer(); + + { + std::scoped_lock rl{reserve_mutex}; + + // Recycle the chunk back to the reserve. + chunk_reserve.emplace_back(std::move(work)); } - std::scoped_lock reserve_lock{reserve_mutex}; - chunk_reserve.push_back(std::move(work)); - } while (!stop_token.stop_requested()); + } } void Scheduler::AllocateWorkerCommandBuffer() { @@ -289,13 +320,16 @@ void Scheduler::EndRenderPass() { } void Scheduler::AcquireNewChunk() { - std::scoped_lock lock{reserve_mutex}; + std::scoped_lock rl{reserve_mutex}; + if (chunk_reserve.empty()) { + // If we don't have anything reserved, we need to make a new chunk. chunk = std::make_unique<CommandChunk>(); - return; + } else { + // Otherwise, we can just take from the reserve. + chunk = std::move(chunk_reserve.back()); + chunk_reserve.pop_back(); } - chunk = std::move(chunk_reserve.back()); - chunk_reserve.pop_back(); } } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_scheduler.h b/src/video_core/renderer_vulkan/vk_scheduler.h index bd4cb0f7e..8d75ce987 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.h +++ b/src/video_core/renderer_vulkan/vk_scheduler.h @@ -232,10 +232,10 @@ private: std::queue<std::unique_ptr<CommandChunk>> work_queue; std::vector<std::unique_ptr<CommandChunk>> chunk_reserve; + std::mutex execution_mutex; std::mutex reserve_mutex; - std::mutex work_mutex; - std::condition_variable_any work_cv; - std::condition_variable wait_cv; + std::mutex queue_mutex; + std::condition_variable_any event_cv; std::jthread worker_thread; }; diff --git a/src/video_core/renderer_vulkan/vk_swapchain.cpp b/src/video_core/renderer_vulkan/vk_swapchain.cpp index b6810eef9..85fdce6e5 100644 --- a/src/video_core/renderer_vulkan/vk_swapchain.cpp +++ b/src/video_core/renderer_vulkan/vk_swapchain.cpp @@ -159,7 +159,7 @@ void Swapchain::CreateSwapchain(const VkSurfaceCapabilitiesKHR& capabilities, bo present_mode = ChooseSwapPresentMode(present_modes); u32 requested_image_count{capabilities.minImageCount + 1}; - // Ensure Tripple buffering if possible. + // Ensure Triple buffering if possible. if (capabilities.maxImageCount > 0) { if (requested_image_count > capabilities.maxImageCount) { requested_image_count = capabilities.maxImageCount; } diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index 80adb70eb..ae15f6976 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -189,13 +189,16 @@ constexpr VkBorderColor ConvertBorderColor(const std::array<float, 4>& color) { if (info.IsRenderTarget()) { return ImageAspectMask(info.format); } - const bool is_first = info.Swizzle()[0] == SwizzleSource::R; + bool any_r = + std::ranges::any_of(info.Swizzle(), [](SwizzleSource s) { return s == SwizzleSource::R; }); switch (info.format) { case PixelFormat::D24_UNORM_S8_UINT: case PixelFormat::D32_FLOAT_S8_UINT: - return is_first ? VK_IMAGE_ASPECT_DEPTH_BIT : VK_IMAGE_ASPECT_STENCIL_BIT; + // R = depth, G = stencil + return any_r ? VK_IMAGE_ASPECT_DEPTH_BIT : VK_IMAGE_ASPECT_STENCIL_BIT; case PixelFormat::S8_UINT_D24_UNORM: - return is_first ? VK_IMAGE_ASPECT_STENCIL_BIT : VK_IMAGE_ASPECT_DEPTH_BIT; + // R = stencil, G = depth + return any_r ? 
VK_IMAGE_ASPECT_STENCIL_BIT : VK_IMAGE_ASPECT_DEPTH_BIT; case PixelFormat::D16_UNORM: case PixelFormat::D32_FLOAT: return VK_IMAGE_ASPECT_DEPTH_BIT; @@ -864,13 +867,19 @@ void TextureCacheRuntime::ReinterpretImage(Image& dst, Image& src, const VkImageAspectFlags src_aspect_mask = src.AspectMask(); const VkImageAspectFlags dst_aspect_mask = dst.AspectMask(); - std::ranges::transform(copies, vk_in_copies.begin(), [src_aspect_mask](const auto& copy) { - return MakeBufferImageCopy(copy, true, src_aspect_mask); - }); + const auto bpp_in = BytesPerBlock(src.info.format) / DefaultBlockWidth(src.info.format); + const auto bpp_out = BytesPerBlock(dst.info.format) / DefaultBlockWidth(dst.info.format); + std::ranges::transform(copies, vk_in_copies.begin(), + [src_aspect_mask, bpp_in, bpp_out](const auto& copy) { + auto copy2 = copy; + copy2.src_offset.x = (bpp_out * copy.src_offset.x) / bpp_in; + copy2.extent.width = (bpp_out * copy.extent.width) / bpp_in; + return MakeBufferImageCopy(copy2, true, src_aspect_mask); + }); std::ranges::transform(copies, vk_out_copies.begin(), [dst_aspect_mask](const auto& copy) { return MakeBufferImageCopy(copy, false, dst_aspect_mask); }); - const u32 img_bpp = BytesPerBlock(src.info.format); + const u32 img_bpp = BytesPerBlock(dst.info.format); size_t total_size = 0; for (const auto& copy : copies) { total_size += copy.extent.width * copy.extent.height * copy.extent.depth * img_bpp; @@ -1306,15 +1315,16 @@ Image::Image(const VideoCommon::NullImageParams& params) : VideoCommon::ImageBas Image::~Image() = default; -void Image::UploadMemory(const StagingBufferRef& map, std::span<const BufferImageCopy> copies) { +void Image::UploadMemory(VkBuffer buffer, VkDeviceSize offset, + std::span<const VideoCommon::BufferImageCopy> copies) { // TODO: Move this to another API const bool is_rescaled = True(flags & ImageFlagBits::Rescaled); if (is_rescaled) { ScaleDown(true); } scheduler->RequestOutsideRenderPassOperationContext(); - std::vector vk_copies = TransformBufferImageCopies(copies, map.offset, aspect_mask); - const VkBuffer src_buffer = map.buffer; + std::vector vk_copies = TransformBufferImageCopies(copies, offset, aspect_mask); + const VkBuffer src_buffer = buffer; const VkImage vk_image = *original_image; const VkImageAspectFlags vk_aspect_mask = aspect_mask; const bool is_initialized = std::exchange(initialized, true); @@ -1327,14 +1337,19 @@ void Image::UploadMemory(const StagingBufferRef& map, std::span<const BufferImag } } -void Image::DownloadMemory(const StagingBufferRef& map, std::span<const BufferImageCopy> copies) { +void Image::UploadMemory(const StagingBufferRef& map, std::span<const BufferImageCopy> copies) { + UploadMemory(map.buffer, map.offset, copies); +} + +void Image::DownloadMemory(VkBuffer buffer, VkDeviceSize offset, + std::span<const VideoCommon::BufferImageCopy> copies) { const bool is_rescaled = True(flags & ImageFlagBits::Rescaled); if (is_rescaled) { ScaleDown(); } - std::vector vk_copies = TransformBufferImageCopies(copies, map.offset, aspect_mask); + std::vector vk_copies = TransformBufferImageCopies(copies, offset, aspect_mask); scheduler->RequestOutsideRenderPassOperationContext(); - scheduler->Record([buffer = map.buffer, image = *original_image, aspect_mask = aspect_mask, + scheduler->Record([buffer, image = *original_image, aspect_mask = aspect_mask, vk_copies](vk::CommandBuffer cmdbuf) { const VkImageMemoryBarrier read_barrier{ .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, @@ -1389,6 +1404,10 @@ void Image::DownloadMemory(const 
diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.h b/src/video_core/renderer_vulkan/vk_texture_cache.h
index 0ce39616f..d5ee23f8d 100644
--- a/src/video_core/renderer_vulkan/vk_texture_cache.h
+++ b/src/video_core/renderer_vulkan/vk_texture_cache.h
@@ -132,9 +132,15 @@ public:
     Image(Image&&) = default;
     Image& operator=(Image&&) = default;
 
+    void UploadMemory(VkBuffer buffer, VkDeviceSize offset,
+                      std::span<const VideoCommon::BufferImageCopy> copies);
+
     void UploadMemory(const StagingBufferRef& map,
                       std::span<const VideoCommon::BufferImageCopy> copies);
 
+    void DownloadMemory(VkBuffer buffer, VkDeviceSize offset,
+                        std::span<const VideoCommon::BufferImageCopy> copies);
+
     void DownloadMemory(const StagingBufferRef& map,
                         std::span<const VideoCommon::BufferImageCopy> copies);
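
The header now pairs each StagingBufferRef entry point with a raw VkBuffer/offset overload, and the staging variant becomes a thin forwarder. A simplified non-Vulkan sketch of the overload shape (all names illustrative):

```cpp
#include <cstdint>
#include <span>

struct BufferRef {
    std::uint64_t buffer; // opaque handle, standing in for VkBuffer
    std::uint64_t offset;
};

struct CopyRegion {};

// Primitive entry point: any buffer handle plus an explicit offset.
void Upload(std::uint64_t buffer, std::uint64_t offset, std::span<const CopyRegion> copies) {
    // record the copy commands here
    (void)buffer;
    (void)offset;
    (void)copies;
}

// Convenience overload: unpack the staging reference and forward.
void Upload(const BufferRef& ref, std::span<const CopyRegion> copies) {
    Upload(ref.buffer, ref.offset, copies);
}
```

Keeping the raw-handle overload as the primitive lets callers that already own a buffer, such as the new DMA path, bypass the staging pool entirely.
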
diff --git a/src/video_core/renderer_vulkan/vk_update_descriptor.cpp b/src/video_core/renderer_vulkan/vk_update_descriptor.cpp
index 4d4a6753b..009dab0b6 100644
--- a/src/video_core/renderer_vulkan/vk_update_descriptor.cpp
+++ b/src/video_core/renderer_vulkan/vk_update_descriptor.cpp
@@ -25,7 +25,7 @@ void UpdateDescriptorQueue::TickFrame() {
 
 void UpdateDescriptorQueue::Acquire() {
     // Minimum number of entries required.
-    // This is the maximum number of entries a single draw call migth use.
+    // This is the maximum number of entries a single draw call might use.
     static constexpr size_t MIN_ENTRIES = 0x400;
 
     if (std::distance(payload.data(), payload_cursor) + MIN_ENTRIES >= payload.max_size()) {
diff --git a/src/video_core/texture_cache/format_lookup_table.cpp b/src/video_core/texture_cache/format_lookup_table.cpp
index 08aa8ca33..5fc2b2fec 100644
--- a/src/video_core/texture_cache/format_lookup_table.cpp
+++ b/src/video_core/texture_cache/format_lookup_table.cpp
@@ -42,15 +42,15 @@ PixelFormat PixelFormatFromTextureInfo(TextureFormat format, ComponentType red,
                                        ComponentType blue, ComponentType alpha,
                                        bool is_srgb) noexcept {
     switch (Hash(format, red, green, blue, alpha, is_srgb)) {
-    case Hash(TextureFormat::A8R8G8B8, UNORM):
+    case Hash(TextureFormat::A8B8G8R8, UNORM):
         return PixelFormat::A8B8G8R8_UNORM;
-    case Hash(TextureFormat::A8R8G8B8, SNORM):
+    case Hash(TextureFormat::A8B8G8R8, SNORM):
         return PixelFormat::A8B8G8R8_SNORM;
-    case Hash(TextureFormat::A8R8G8B8, UINT):
+    case Hash(TextureFormat::A8B8G8R8, UINT):
         return PixelFormat::A8B8G8R8_UINT;
-    case Hash(TextureFormat::A8R8G8B8, SINT):
+    case Hash(TextureFormat::A8B8G8R8, SINT):
         return PixelFormat::A8B8G8R8_SINT;
-    case Hash(TextureFormat::A8R8G8B8, UNORM, SRGB):
+    case Hash(TextureFormat::A8B8G8R8, UNORM, SRGB):
         return PixelFormat::A8B8G8R8_SRGB;
     case Hash(TextureFormat::B5G6R5, UNORM):
         return PixelFormat::B5G6R5_UNORM;
@@ -74,13 +74,13 @@ PixelFormat PixelFormatFromTextureInfo(TextureFormat format, ComponentType red,
         return PixelFormat::R8_UINT;
     case Hash(TextureFormat::R8, SINT):
         return PixelFormat::R8_SINT;
-    case Hash(TextureFormat::R8G8, UNORM):
+    case Hash(TextureFormat::G8R8, UNORM):
         return PixelFormat::R8G8_UNORM;
-    case Hash(TextureFormat::R8G8, SNORM):
+    case Hash(TextureFormat::G8R8, SNORM):
         return PixelFormat::R8G8_SNORM;
-    case Hash(TextureFormat::R8G8, UINT):
+    case Hash(TextureFormat::G8R8, UINT):
         return PixelFormat::R8G8_UINT;
-    case Hash(TextureFormat::R8G8, SINT):
+    case Hash(TextureFormat::G8R8, SINT):
         return PixelFormat::R8G8_SINT;
     case Hash(TextureFormat::R16G16B16A16, FLOAT):
         return PixelFormat::R16G16B16A16_FLOAT;
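
Dispatch in this lookup table relies on Hash packing every field of the query into one integer so each case label stays a compile-time constant. A self-contained sketch of the idea; the field widths below are illustrative, not the table's actual packing:

```cpp
#include <cstdint>

enum class Format : std::uint64_t { A8B8G8R8 = 0x08, G8R8 = 0x18 };
enum class Component : std::uint64_t { SNORM = 1, UNORM = 2, SINT = 3, UINT = 4 };

// Pack all lookup fields into one integer so a switch can dispatch on it.
constexpr std::uint64_t Hash(Format fmt, Component r, Component g, Component b, Component a,
                             bool srgb) {
    std::uint64_t key = static_cast<std::uint64_t>(fmt);
    key |= static_cast<std::uint64_t>(r) << 8;
    key |= static_cast<std::uint64_t>(g) << 11;
    key |= static_cast<std::uint64_t>(b) << 14;
    key |= static_cast<std::uint64_t>(a) << 17;
    key |= static_cast<std::uint64_t>(srgb) << 20;
    return key;
}

// Distinct component types must yield distinct keys for the switch to work.
static_assert(Hash(Format::G8R8, Component::UNORM, Component::UNORM, Component::UNORM,
                   Component::UNORM, false) !=
              Hash(Format::G8R8, Component::SNORM, Component::SNORM, Component::SNORM,
                   Component::SNORM, false));
```
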
@@ -136,49 +136,49 @@ PixelFormat PixelFormatFromTextureInfo(TextureFormat format, ComponentType red,
         return PixelFormat::R32_SINT;
     case Hash(TextureFormat::E5B9G9R9, FLOAT):
         return PixelFormat::E5B9G9R9_FLOAT;
-    case Hash(TextureFormat::D32, FLOAT):
+    case Hash(TextureFormat::Z32, FLOAT):
         return PixelFormat::D32_FLOAT;
-    case Hash(TextureFormat::D16, UNORM):
+    case Hash(TextureFormat::Z16, UNORM):
         return PixelFormat::D16_UNORM;
-    case Hash(TextureFormat::S8D24, UINT, UNORM, UNORM, UNORM, LINEAR):
+    case Hash(TextureFormat::Z24S8, UINT, UNORM, UNORM, UNORM, LINEAR):
         return PixelFormat::S8_UINT_D24_UNORM;
-    case Hash(TextureFormat::S8D24, UINT, UNORM, UINT, UINT, LINEAR):
+    case Hash(TextureFormat::Z24S8, UINT, UNORM, UINT, UINT, LINEAR):
         return PixelFormat::S8_UINT_D24_UNORM;
-    case Hash(TextureFormat::R8G24, UINT, UNORM, UNORM, UNORM, LINEAR):
+    case Hash(TextureFormat::G24R8, UINT, UNORM, UNORM, UNORM, LINEAR):
         return PixelFormat::S8_UINT_D24_UNORM;
-    case Hash(TextureFormat::D24S8, UNORM, UINT, UINT, UINT, LINEAR):
+    case Hash(TextureFormat::S8Z24, UNORM, UINT, UINT, UINT, LINEAR):
         return PixelFormat::D24_UNORM_S8_UINT;
-    case Hash(TextureFormat::D32S8, FLOAT, UINT, UNORM, UNORM, LINEAR):
+    case Hash(TextureFormat::Z32_X24S8, FLOAT, UINT, UNORM, UNORM, LINEAR):
         return PixelFormat::D32_FLOAT_S8_UINT;
-    case Hash(TextureFormat::R32_B24G8, FLOAT, UINT, UNORM, UNORM, LINEAR):
+    case Hash(TextureFormat::R32B24G8, FLOAT, UINT, UNORM, UNORM, LINEAR):
         return PixelFormat::D32_FLOAT_S8_UINT;
-    case Hash(TextureFormat::BC1_RGBA, UNORM, LINEAR):
+    case Hash(TextureFormat::DXT1, UNORM, LINEAR):
         return PixelFormat::BC1_RGBA_UNORM;
-    case Hash(TextureFormat::BC1_RGBA, UNORM, SRGB):
+    case Hash(TextureFormat::DXT1, UNORM, SRGB):
         return PixelFormat::BC1_RGBA_SRGB;
-    case Hash(TextureFormat::BC2, UNORM, LINEAR):
+    case Hash(TextureFormat::DXT23, UNORM, LINEAR):
         return PixelFormat::BC2_UNORM;
-    case Hash(TextureFormat::BC2, UNORM, SRGB):
+    case Hash(TextureFormat::DXT23, UNORM, SRGB):
         return PixelFormat::BC2_SRGB;
-    case Hash(TextureFormat::BC3, UNORM, LINEAR):
+    case Hash(TextureFormat::DXT45, UNORM, LINEAR):
         return PixelFormat::BC3_UNORM;
-    case Hash(TextureFormat::BC3, UNORM, SRGB):
+    case Hash(TextureFormat::DXT45, UNORM, SRGB):
         return PixelFormat::BC3_SRGB;
-    case Hash(TextureFormat::BC4, UNORM):
+    case Hash(TextureFormat::DXN1, UNORM):
         return PixelFormat::BC4_UNORM;
-    case Hash(TextureFormat::BC4, SNORM):
+    case Hash(TextureFormat::DXN1, SNORM):
         return PixelFormat::BC4_SNORM;
-    case Hash(TextureFormat::BC5, UNORM):
+    case Hash(TextureFormat::DXN2, UNORM):
         return PixelFormat::BC5_UNORM;
-    case Hash(TextureFormat::BC5, SNORM):
+    case Hash(TextureFormat::DXN2, SNORM):
         return PixelFormat::BC5_SNORM;
-    case Hash(TextureFormat::BC7, UNORM, LINEAR):
+    case Hash(TextureFormat::BC7U, UNORM, LINEAR):
         return PixelFormat::BC7_UNORM;
-    case Hash(TextureFormat::BC7, UNORM, SRGB):
+    case Hash(TextureFormat::BC7U, UNORM, SRGB):
         return PixelFormat::BC7_SRGB;
-    case Hash(TextureFormat::BC6H_SFLOAT, FLOAT):
+    case Hash(TextureFormat::BC6H_S16, FLOAT):
         return PixelFormat::BC6H_SFLOAT;
-    case Hash(TextureFormat::BC6H_UFLOAT, FLOAT):
+    case Hash(TextureFormat::BC6H_U16, FLOAT):
         return PixelFormat::BC6H_UFLOAT;
     case Hash(TextureFormat::ASTC_2D_4X4, UNORM, LINEAR):
         return PixelFormat::ASTC_2D_4X4_UNORM;
diff --git a/src/video_core/texture_cache/image_base.h b/src/video_core/texture_cache/image_base.h
index e8fa592d2..329396bb6 100644
--- a/src/video_core/texture_cache/image_base.h
+++ b/src/video_core/texture_cache/image_base.h
@@ -25,7 +25,7 @@ enum class ImageFlagBits : u32 {
     Registered = 1 << 6, ///< True when the image is registered
     Picked = 1 << 7,     ///< Temporary flag to mark the image as picked
     Remapped = 1 << 8,   ///< Image has been remapped.
-    Sparse = 1 << 9,     ///< Image has non continous submemory.
+    Sparse = 1 << 9,     ///< Image has non continuous submemory.
 
     // Garbage Collection Flags
     BadOverlap = 1 << 10, ///< This image overlaps other but doesn't fit, has higher
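
ImageFlagBits is a bitmask: each flag occupies one bit so membership tests compile to a single AND. A minimal sketch of the pattern, with local stand-ins for yuzu's flag helpers:

```cpp
#include <cstdint>

enum class ImageFlags : std::uint32_t {
    GpuModified = 1u << 0,
    Sparse = 1u << 1,
};

constexpr ImageFlags operator&(ImageFlags a, ImageFlags b) {
    return static_cast<ImageFlags>(static_cast<std::uint32_t>(a) & static_cast<std::uint32_t>(b));
}
constexpr ImageFlags operator|(ImageFlags a, ImageFlags b) {
    return static_cast<ImageFlags>(static_cast<std::uint32_t>(a) | static_cast<std::uint32_t>(b));
}

// yuzu spells this helper True(); a local stand-in is used here.
constexpr bool Any(ImageFlags f) {
    return static_cast<std::uint32_t>(f) != 0;
}

static_assert(Any((ImageFlags::GpuModified | ImageFlags::Sparse) & ImageFlags::Sparse));
```
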
diff --git a/src/video_core/texture_cache/image_info.cpp b/src/video_core/texture_cache/image_info.cpp
index e9100091e..a1296b574 100644
--- a/src/video_core/texture_cache/image_info.cpp
+++ b/src/video_core/texture_cache/image_info.cpp
@@ -216,10 +216,51 @@ ImageInfo::ImageInfo(const Tegra::Engines::Fermi2D::Surface& config) noexcept {
             .height = config.height,
             .depth = 1,
         };
-        rescaleable = block.depth == 0;
-        rescaleable &= size.height > 256;
+        rescaleable = block.depth == 0 && size.height > 256;
         downscaleable = size.height > 512;
     }
 }
 
+static PixelFormat ByteSizeToFormat(u32 bytes_per_pixel) {
+    switch (bytes_per_pixel) {
+    case 1:
+        return PixelFormat::R8_UINT;
+    case 2:
+        return PixelFormat::R8G8_UINT;
+    case 4:
+        return PixelFormat::A8B8G8R8_UINT;
+    case 8:
+        return PixelFormat::R16G16B16A16_UINT;
+    case 16:
+        return PixelFormat::R32G32B32A32_UINT;
+    default:
+        UNIMPLEMENTED();
+        return PixelFormat::Invalid;
+    }
+}
+
+ImageInfo::ImageInfo(const Tegra::DMA::ImageOperand& config) noexcept {
+    const u32 bytes_per_pixel = config.bytes_per_pixel;
+    format = ByteSizeToFormat(bytes_per_pixel);
+    type = config.params.block_size.depth > 0 ? ImageType::e3D : ImageType::e2D;
+    num_samples = 1;
+    block = Extent3D{
+        .width = config.params.block_size.width,
+        .height = config.params.block_size.height,
+        .depth = config.params.block_size.depth,
+    };
+    size = Extent3D{
+        .width = config.params.width,
+        .height = config.params.height,
+        .depth = config.params.depth,
+    };
+    tile_width_spacing = 0;
+    resources.levels = 1;
+    resources.layers = 1;
+    layer_stride = CalculateLayerStride(*this);
+    maybe_unaligned_layer_stride = CalculateLayerSize(*this);
+    rescaleable = block.depth == 0 && size.height > 256;
+    downscaleable = size.height > 512;
+}
+
 } // namespace VideoCommon
diff --git a/src/video_core/texture_cache/image_info.h b/src/video_core/texture_cache/image_info.h
index 93755e15e..a12f5b44f 100644
--- a/src/video_core/texture_cache/image_info.h
+++ b/src/video_core/texture_cache/image_info.h
@@ -5,6 +5,7 @@
 
 #include "video_core/engines/fermi_2d.h"
 #include "video_core/engines/maxwell_3d.h"
+#include "video_core/engines/maxwell_dma.h"
 #include "video_core/surface.h"
 #include "video_core/texture_cache/types.h"
 
@@ -19,6 +20,7 @@ struct ImageInfo {
     explicit ImageInfo(const Tegra::Engines::Maxwell3D::Regs& regs, size_t index) noexcept;
     explicit ImageInfo(const Tegra::Engines::Maxwell3D::Regs& regs) noexcept;
     explicit ImageInfo(const Tegra::Engines::Fermi2D::Surface& config) noexcept;
+    explicit ImageInfo(const Tegra::DMA::ImageOperand& config) noexcept;
 
     PixelFormat format = PixelFormat::Invalid;
     ImageType type = ImageType::e1D;
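
A DMA operand carries only a pixel stride, no component layout, so the constructor above fabricates a format purely from bytes per pixel; any UINT format of the right size works because the copy is bit-exact. A sketch with a local enum standing in for PixelFormat:

```cpp
#include <cstdint>

enum class Fmt { Invalid, R8_UINT, R8G8_UINT, A8B8G8R8_UINT, R16G16B16A16_UINT, R32G32B32A32_UINT };

// Pick any UINT format whose block size matches the DMA stride; the data is
// moved bit-for-bit either way, so the component split is irrelevant.
constexpr Fmt ByteSizeToFormat(std::uint32_t bytes_per_pixel) {
    switch (bytes_per_pixel) {
    case 1:
        return Fmt::R8_UINT;
    case 2:
        return Fmt::R8G8_UINT;
    case 4:
        return Fmt::A8B8G8R8_UINT;
    case 8:
        return Fmt::R16G16B16A16_UINT;
    case 16:
        return Fmt::R32G32B32A32_UINT;
    default:
        return Fmt::Invalid;
    }
}

static_assert(ByteSizeToFormat(4) == Fmt::A8B8G8R8_UINT);
```
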
diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h
index 9dd152fbe..8e8b9a5e6 100644
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@@ -745,6 +745,25 @@ void TextureCache<P>::PopAsyncFlushes() {
 }
 
 template <class P>
+ImageId TextureCache<P>::DmaImageId(const Tegra::DMA::ImageOperand& operand) {
+    const ImageInfo dst_info(operand);
+    const ImageId dst_id = FindDMAImage(dst_info, operand.address);
+    if (!dst_id) {
+        return NULL_IMAGE_ID;
+    }
+    const auto& image = slot_images[dst_id];
+    if (False(image.flags & ImageFlagBits::GpuModified)) {
+        // No need to waste time on an image that's synced with guest
+        return NULL_IMAGE_ID;
+    }
+    const auto base = image.TryFindBase(operand.address);
+    if (!base) {
+        return NULL_IMAGE_ID;
+    }
+    return dst_id;
+}
+
+template <class P>
 bool TextureCache<P>::IsRescaling() const noexcept {
     return is_rescaling;
 }
@@ -772,6 +791,49 @@ bool TextureCache<P>::IsRegionGpuModified(VAddr addr, size_t size) {
 }
 
 template <class P>
+std::pair<typename TextureCache<P>::Image*, BufferImageCopy> TextureCache<P>::DmaBufferImageCopy(
+    const Tegra::DMA::ImageCopy& copy_info, const Tegra::DMA::BufferOperand& buffer_operand,
+    const Tegra::DMA::ImageOperand& image_operand, ImageId image_id, bool modifies_image) {
+    const auto [level, base] = PrepareDmaImage(image_id, image_operand.address, modifies_image);
+    auto* image = &slot_images[image_id];
+    const u32 buffer_size = static_cast<u32>(buffer_operand.pitch * buffer_operand.height);
+    const u32 bpp = VideoCore::Surface::BytesPerBlock(image->info.format);
+    const auto convert = [old_bpp = image_operand.bytes_per_pixel, bpp](u32 value) {
+        return (old_bpp * value) / bpp;
+    };
+    const u32 base_x = convert(image_operand.params.origin.x.Value());
+    const u32 base_y = image_operand.params.origin.y.Value();
+    const u32 length_x = convert(copy_info.length_x);
+    const u32 length_y = copy_info.length_y;
+
+    const BufferImageCopy copy{
+        .buffer_offset = 0,
+        .buffer_size = buffer_size,
+        .buffer_row_length = convert(buffer_operand.pitch),
+        .buffer_image_height = buffer_operand.height,
+        .image_subresource =
+            {
+                .base_level = static_cast<s32>(level),
+                .base_layer = static_cast<s32>(base),
+                .num_layers = 1,
+            },
+        .image_offset =
+            {
+                .x = static_cast<s32>(base_x),
+                .y = static_cast<s32>(base_y),
+                .z = 0,
+            },
+        .image_extent =
+            {
+                .width = length_x,
+                .height = length_y,
+                .depth = 1,
+            },
+    };
+    return {image, copy};
+}
+
+template <class P>
 void TextureCache<P>::RefreshContents(Image& image, ImageId image_id) {
     if (False(image.flags & ImageFlagBits::CpuModified)) {
         // Only upload modified images
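
In DmaBufferImageCopy, only horizontal quantities go through convert: x origins, widths, and the row pitch are measured in texels whose size changed with the fabricated format, while row counts are unaffected. A worked instance of the conversion, assuming a 1 byte/pixel DMA view of a 4 bytes/block cached image:

```cpp
#include <cstdint>

constexpr std::uint32_t Convert(std::uint32_t value, std::uint32_t old_bpp, std::uint32_t bpp) {
    return (old_bpp * value) / bpp;
}

// A 256-byte-wide row seen as 1 B/px becomes 64 texels of a 4 B/block image;
// the row count (y, height) is unaffected by the stride change.
static_assert(Convert(256, 1, 4) == 64);
static_assert(Convert(640, 1, 4) == 160);
```
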
@@ -1359,6 +1421,63 @@ std::optional<typename TextureCache<P>::BlitImages> TextureCache<P>::GetBlitImag
 }
 
 template <class P>
+ImageId TextureCache<P>::FindDMAImage(const ImageInfo& info, GPUVAddr gpu_addr) {
+    std::optional<VAddr> cpu_addr = gpu_memory->GpuToCpuAddress(gpu_addr);
+    if (!cpu_addr) {
+        cpu_addr = gpu_memory->GpuToCpuAddress(gpu_addr, CalculateGuestSizeInBytes(info));
+        if (!cpu_addr) {
+            return ImageId{};
+        }
+    }
+    ImageId image_id{};
+    boost::container::small_vector<ImageId, 1> image_ids;
+    const auto lambda = [&](ImageId existing_image_id, ImageBase& existing_image) {
+        if (True(existing_image.flags & ImageFlagBits::Remapped)) {
+            return false;
+        }
+        if (info.type == ImageType::Linear || existing_image.info.type == ImageType::Linear)
+            [[unlikely]] {
+            const bool strict_size = True(existing_image.flags & ImageFlagBits::Strong);
+            const ImageInfo& existing = existing_image.info;
+            if (existing_image.gpu_addr == gpu_addr && existing.type == info.type &&
+                existing.pitch == info.pitch &&
+                IsPitchLinearSameSize(existing, info, strict_size) &&
+                IsViewCompatible(existing.format, info.format, false, true)) {
+                image_id = existing_image_id;
+                image_ids.push_back(existing_image_id);
+                return true;
+            }
+        } else if (IsSubCopy(info, existing_image, gpu_addr)) {
+            image_id = existing_image_id;
+            image_ids.push_back(existing_image_id);
+            return true;
+        }
+        return false;
+    };
+    ForEachImageInRegion(*cpu_addr, CalculateGuestSizeInBytes(info), lambda);
+    if (image_ids.size() <= 1) [[likely]] {
+        return image_id;
+    }
+    auto image_ids_compare = [this](ImageId a, ImageId b) {
+        auto& image_a = slot_images[a];
+        auto& image_b = slot_images[b];
+        return image_a.modification_tick < image_b.modification_tick;
+    };
+    return *std::ranges::max_element(image_ids, image_ids_compare);
+}
+
+template <class P>
+std::pair<u32, u32> TextureCache<P>::PrepareDmaImage(ImageId dst_id, GPUVAddr base_addr,
+                                                     bool mark_as_modified) {
+    const auto& image = slot_images[dst_id];
+    const auto base = image.TryFindBase(base_addr);
+    PrepareImage(dst_id, mark_as_modified, false);
+    const auto& new_image = slot_images[dst_id];
+    lru_cache.Touch(new_image.lru_index, frame_tick);
+    return std::make_pair(base->level, base->layer);
+}
+
+template <class P>
 SamplerId TextureCache<P>::FindSampler(const TSCEntry& config) {
     if (std::ranges::all_of(config.raw, [](u64 value) { return value == 0; })) {
         return NULL_SAMPLER_ID;
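
When FindDMAImage sees several overlapping candidates, it keeps the one with the highest modification_tick, i.e. the image written most recently. The selection reduces to std::ranges::max_element with a tick comparator (illustrative types):

```cpp
#include <algorithm>
#include <cassert>
#include <vector>

struct ImageEntry {
    int id;
    unsigned long long modification_tick;
};

int main() {
    // Of three cached images overlapping the DMA range, prefer the one
    // written most recently, mirroring FindDMAImage's tie-break.
    std::vector<ImageEntry> overlaps{{1, 10}, {2, 42}, {3, 7}};
    const auto newest =
        std::ranges::max_element(overlaps, [](const ImageEntry& a, const ImageEntry& b) {
            return a.modification_tick < b.modification_tick;
        });
    assert(newest->id == 2);
}
```
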
diff --git a/src/video_core/texture_cache/texture_cache_base.h b/src/video_core/texture_cache/texture_cache_base.h
index 013836933..5a5b4179c 100644
--- a/src/video_core/texture_cache/texture_cache_base.h
+++ b/src/video_core/texture_cache/texture_cache_base.h
@@ -209,6 +209,12 @@ public:
     /// Pop asynchronous downloads
     void PopAsyncFlushes();
 
+    [[nodiscard]] ImageId DmaImageId(const Tegra::DMA::ImageOperand& operand);
+
+    [[nodiscard]] std::pair<Image*, BufferImageCopy> DmaBufferImageCopy(
+        const Tegra::DMA::ImageCopy& copy_info, const Tegra::DMA::BufferOperand& buffer_operand,
+        const Tegra::DMA::ImageOperand& image_operand, ImageId image_id, bool modifies_image);
+
     /// Return true when a CPU region is modified from the GPU
     [[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size);
 
@@ -300,6 +306,8 @@ private:
     /// Remove joined images from the cache
     [[nodiscard]] ImageId JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VAddr cpu_addr);
 
+    [[nodiscard]] ImageId FindDMAImage(const ImageInfo& info, GPUVAddr gpu_addr);
+
     /// Return a blit image pair from the given guest blit parameters
     [[nodiscard]] std::optional<BlitImages> GetBlitImages(
         const Tegra::Engines::Fermi2D::Surface& dst, const Tegra::Engines::Fermi2D::Surface& src,
@@ -381,6 +389,9 @@ private:
     /// Returns true if the current clear parameters clear the whole image of a given image view
     [[nodiscard]] bool IsFullClear(ImageViewId id);
 
+    [[nodiscard]] std::pair<u32, u32> PrepareDmaImage(ImageId dst_id, GPUVAddr base_addr,
+                                                      bool mark_as_modified);
+
     bool ImageCanRescale(ImageBase& image);
     void InvalidateScale(Image& image);
     bool ScaleUp(Image& image);
diff --git a/src/video_core/texture_cache/types.h b/src/video_core/texture_cache/types.h
index 0453456b4..a0e10643f 100644
--- a/src/video_core/texture_cache/types.h
+++ b/src/video_core/texture_cache/types.h
@@ -54,6 +54,7 @@ enum class RelaxedOptions : u32 {
     Format = 1 << 1,
     Samples = 1 << 2,
     ForceBrokenViews = 1 << 3,
+    FormatBpp = 1 << 4,
 };
 DECLARE_ENUM_FLAG_OPERATORS(RelaxedOptions)
 
diff --git a/src/video_core/texture_cache/util.cpp b/src/video_core/texture_cache/util.cpp
index 697f86641..de37db684 100644
--- a/src/video_core/texture_cache/util.cpp
+++ b/src/video_core/texture_cache/util.cpp
@@ -743,6 +743,44 @@ std::vector<ImageCopy> MakeShrinkImageCopies(const ImageInfo& dst, const ImageIn
     return copies;
 }
 
+std::vector<ImageCopy> MakeReinterpretImageCopies(const ImageInfo& src, u32 up_scale,
+                                                  u32 down_shift) {
+    std::vector<ImageCopy> copies;
+    copies.reserve(src.resources.levels);
+    const bool is_3d = src.type == ImageType::e3D;
+    for (s32 level = 0; level < src.resources.levels; ++level) {
+        ImageCopy& copy = copies.emplace_back();
+        copy.src_subresource = SubresourceLayers{
+            .base_level = level,
+            .base_layer = 0,
+            .num_layers = src.resources.layers,
+        };
+        copy.dst_subresource = SubresourceLayers{
+            .base_level = level,
+            .base_layer = 0,
+            .num_layers = src.resources.layers,
+        };
+        copy.src_offset = Offset3D{
+            .x = 0,
+            .y = 0,
+            .z = 0,
+        };
+        copy.dst_offset = Offset3D{
+            .x = 0,
+            .y = 0,
+            .z = 0,
+        };
+        const Extent3D mip_size = AdjustMipSize(src.size, level);
+        copy.extent = AdjustSamplesSize(mip_size, src.num_samples);
+        if (is_3d) {
+            copy.extent.depth = src.size.depth;
+        }
+        copy.extent.width = std::max<u32>((copy.extent.width * up_scale) >> down_shift, 1);
+        copy.extent.height = std::max<u32>((copy.extent.height * up_scale) >> down_shift, 1);
+    }
+    return copies;
+}
+
 bool IsValidEntry(const Tegra::MemoryManager& gpu_memory, const TICEntry& config) {
     const GPUVAddr address = config.Address();
     if (address == 0) {
@@ -999,6 +1037,20 @@ bool IsBlockLinearSizeCompatible(const ImageInfo& lhs, const ImageInfo& rhs, u32
     }
 }
 
+bool IsBlockLinearSizeCompatibleBPPRelaxed(const ImageInfo& lhs, const ImageInfo& rhs,
+                                           u32 lhs_level, u32 rhs_level) noexcept {
+    ASSERT(lhs.type != ImageType::Linear);
+    ASSERT(rhs.type != ImageType::Linear);
+    const auto lhs_bpp = BytesPerBlock(lhs.format);
+    const auto rhs_bpp = BytesPerBlock(rhs.format);
+    const Extent3D lhs_size = AdjustMipSize(lhs.size, lhs_level);
+    const Extent3D rhs_size = AdjustMipSize(rhs.size, rhs_level);
+    return Common::AlignUpLog2(lhs_size.width * lhs_bpp, GOB_SIZE_X_SHIFT) ==
+               Common::AlignUpLog2(rhs_size.width * rhs_bpp, GOB_SIZE_X_SHIFT) &&
+           Common::AlignUpLog2(lhs_size.height, GOB_SIZE_Y_SHIFT) ==
+               Common::AlignUpLog2(rhs_size.height, GOB_SIZE_Y_SHIFT);
+}
+
 bool IsPitchLinearSameSize(const ImageInfo& lhs, const ImageInfo& rhs, bool strict_size) noexcept {
     ASSERT(lhs.type == ImageType::Linear);
     ASSERT(rhs.type == ImageType::Linear);
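
IsBlockLinearSizeCompatibleBPPRelaxed compares GOB-aligned byte widths rather than texel widths, so two views can alias the same block-linear memory even with different bytes per block. A sketch of the alignment arithmetic; a Maxwell GOB is 64 bytes wide (shift 6) by 8 rows tall (shift 3):

```cpp
#include <cstdint>

// Align `value` up to a power-of-two boundary given as a shift, like
// Common::AlignUpLog2.
constexpr std::uint32_t AlignUpLog2(std::uint32_t value, std::uint32_t shift) {
    return (value + (1u << shift) - 1) >> shift << shift;
}

// A 100-texel row at 4 B/block and a 50-texel row at 8 B/block both span
// 400 bytes, which align to the same 448-byte GOB pitch.
static_assert(AlignUpLog2(100 * 4, 6) == AlignUpLog2(50 * 8, 6));
```
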
@@ -1073,7 +1125,8 @@ std::optional<SubresourceBase> FindSubresource(const ImageInfo& candidate, const
         // Format checking is relaxed, but we still have to check for matching bytes per block.
         // This avoids creating a view for blits on UE4 titles where formats with different bytes
         // per block are aliased.
-        if (BytesPerBlock(existing.format) != BytesPerBlock(candidate.format)) {
+        if (BytesPerBlock(existing.format) != BytesPerBlock(candidate.format) &&
+            False(options & RelaxedOptions::FormatBpp)) {
             return std::nullopt;
         }
     } else {
@@ -1088,10 +1141,8 @@ std::optional<SubresourceBase> FindSubresource(const ImageInfo& candidate, const
     if (existing.type != candidate.type) {
         return std::nullopt;
     }
-    if (False(options & RelaxedOptions::Samples)) {
-        if (existing.num_samples != candidate.num_samples) {
-            return std::nullopt;
-        }
+    if (False(options & RelaxedOptions::Samples) && existing.num_samples != candidate.num_samples) {
+        return std::nullopt;
     }
     if (existing.resources.levels < candidate.resources.levels + base->level) {
         return std::nullopt;
@@ -1101,14 +1152,16 @@ std::optional<SubresourceBase> FindSubresource(const ImageInfo& candidate, const
         if (mip_depth < candidate.size.depth + base->layer) {
             return std::nullopt;
         }
-    } else {
-        if (existing.resources.layers < candidate.resources.layers + base->layer) {
-            return std::nullopt;
-        }
+    } else if (existing.resources.layers < candidate.resources.layers + base->layer) {
+        return std::nullopt;
     }
     const bool strict_size = False(options & RelaxedOptions::Size);
     if (!IsBlockLinearSizeCompatible(existing, candidate, base->level, 0, strict_size)) {
-        return std::nullopt;
+        if (False(options & RelaxedOptions::FormatBpp)) {
+            return std::nullopt;
+        } else if (!IsBlockLinearSizeCompatibleBPPRelaxed(existing, candidate, base->level, 0)) {
+            return std::nullopt;
+        }
     }
     // TODO: compare block sizes
     return base;
@@ -1120,6 +1173,31 @@ bool IsSubresource(const ImageInfo& candidate, const ImageBase& image, GPUVAddr
         .has_value();
 }
 
+bool IsSubCopy(const ImageInfo& candidate, const ImageBase& image, GPUVAddr candidate_addr) {
+    const std::optional<SubresourceBase> base = image.TryFindBase(candidate_addr);
+    if (!base) {
+        return false;
+    }
+    const ImageInfo& existing = image.info;
+    if (existing.resources.levels < candidate.resources.levels + base->level) {
+        return false;
+    }
+    if (existing.type == ImageType::e3D) {
+        const u32 mip_depth = std::max(1U, existing.size.depth >> base->level);
+        if (mip_depth < candidate.size.depth + base->layer) {
+            return false;
+        }
+    } else {
+        if (existing.resources.layers < candidate.resources.layers + base->layer) {
+            return false;
+        }
+    }
+    if (!IsBlockLinearSizeCompatibleBPPRelaxed(existing, candidate, base->level, 0)) {
+        return false;
+    }
+    return true;
+}
+
 void DeduceBlitImages(ImageInfo& dst_info, ImageInfo& src_info, const ImageBase* dst,
                       const ImageBase* src) {
     const auto original_dst_format = dst_info.format;
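
The mip_depth computation in IsSubCopy shrinks the base depth by the mip level, one right shift per level clamped to 1, matching how 3D mip chains are defined:

```cpp
#include <algorithm>
#include <cstdint>

// Mip level N halves each dimension N times, never dropping below 1.
constexpr std::uint32_t AdjustMipDim(std::uint32_t dim, std::uint32_t level) {
    return std::max(1u, dim >> level);
}

static_assert(AdjustMipDim(64, 0) == 64);
static_assert(AdjustMipDim(64, 3) == 8);
static_assert(AdjustMipDim(64, 7) == 1); // clamped at 1
```
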
diff --git a/src/video_core/texture_cache/util.h b/src/video_core/texture_cache/util.h
index d103db8ae..84aa6880d 100644
--- a/src/video_core/texture_cache/util.h
+++ b/src/video_core/texture_cache/util.h
@@ -56,6 +56,10 @@ struct OverlapResult {
                                                        SubresourceBase base, u32 up_scale = 1,
                                                        u32 down_shift = 0);
 
+[[nodiscard]] std::vector<ImageCopy> MakeReinterpretImageCopies(const ImageInfo& src,
+                                                                u32 up_scale = 1,
+                                                                u32 down_shift = 0);
+
 [[nodiscard]] bool IsValidEntry(const Tegra::MemoryManager& gpu_memory, const TICEntry& config);
 
 [[nodiscard]] std::vector<BufferImageCopy> UnswizzleImage(Tegra::MemoryManager& gpu_memory,
@@ -88,6 +92,9 @@ void SwizzleImage(Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr, const Ima
 [[nodiscard]] bool IsPitchLinearSameSize(const ImageInfo& lhs, const ImageInfo& rhs,
                                          bool strict_size) noexcept;
 
+[[nodiscard]] bool IsBlockLinearSizeCompatibleBPPRelaxed(const ImageInfo& lhs, const ImageInfo& rhs,
+                                                         u32 lhs_level, u32 rhs_level) noexcept;
+
 [[nodiscard]] std::optional<OverlapResult> ResolveOverlap(const ImageInfo& new_info,
                                                           GPUVAddr gpu_addr, VAddr cpu_addr,
                                                           const ImageBase& overlap,
@@ -106,6 +113,9 @@ void SwizzleImage(Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr, const Ima
                                    GPUVAddr candidate_addr, RelaxedOptions options,
                                    bool broken_views, bool native_bgr);
 
+[[nodiscard]] bool IsSubCopy(const ImageInfo& candidate, const ImageBase& image,
+                             GPUVAddr candidate_addr);
+
 void DeduceBlitImages(ImageInfo& dst_info, ImageInfo& src_info, const ImageBase* dst,
                       const ImageBase* src);
 
diff --git a/src/video_core/textures/astc.cpp b/src/video_core/textures/astc.cpp
index 4381eed1d..a68bc0d77 100644
--- a/src/video_core/textures/astc.cpp
+++ b/src/video_core/textures/astc.cpp
@@ -1571,7 +1571,7 @@ static void DecompressBlock(std::span<const u8, 16> inBuf, const u32 blockWidth,
     assert(strm.GetBitsRead() + weightParams.GetPackedBitSize() == 128);
 
     // Decode both color data and texel weight data
-    u32 colorValues[32]; // Four values, two endpoints, four maximum paritions
+    u32 colorValues[32]; // Four values, two endpoints, four maximum partitions
     DecodeColorValues(colorValues, colorEndpointData, colorEndpointMode, nPartitions,
                       colorDataBits);
 
diff --git a/src/video_core/textures/texture.h b/src/video_core/textures/texture.h
index 7c4553a53..7e5837b20 100644
--- a/src/video_core/textures/texture.h
+++ b/src/video_core/textures/texture.h
@@ -15,26 +15,26 @@ enum class TextureFormat : u32 {
     R32G32B32 = 0x02,
     R16G16B16A16 = 0x03,
     R32G32 = 0x04,
-    R32_B24G8 = 0x05,
+    R32B24G8 = 0x05,
    ETC2_RGB = 0x06,
     X8B8G8R8 = 0x07,
-    A8R8G8B8 = 0x08,
+    A8B8G8R8 = 0x08,
     A2B10G10R10 = 0x09,
     ETC2_RGB_PTA = 0x0a,
     ETC2_RGBA = 0x0b,
     R16G16 = 0x0c,
-    R24G8 = 0x0d,
-    R8G24 = 0x0e,
+    G8R24 = 0x0d,
+    G24R8 = 0x0e,
     R32 = 0x0f,
-    BC6H_SFLOAT = 0x10,
-    BC6H_UFLOAT = 0x11,
+    BC6H_S16 = 0x10,
+    BC6H_U16 = 0x11,
     A4B4G4R4 = 0x12,
     A5B5G5R1 = 0x13,
     A1B5G5R5 = 0x14,
     B5G6R5 = 0x15,
     B6G5R5 = 0x16,
-    BC7 = 0x17,
-    R8G8 = 0x18,
+    BC7U = 0x17,
+    G8R8 = 0x18,
     EAC = 0x19,
     EACX2 = 0x1a,
     R16 = 0x1b,
@@ -46,33 +46,33 @@ enum class TextureFormat : u32 {
     B10G11R11 = 0x21,
     G8B8G8R8 = 0x22,
     B8G8R8G8 = 0x23,
-    BC1_RGBA = 0x24,
-    BC2 = 0x25,
-    BC3 = 0x26,
-    BC4 = 0x27,
-    BC5 = 0x28,
-    S8D24 = 0x29,
-    X8D24 = 0x2a,
-    D24S8 = 0x2b,
-    X4V4D24__COV4R4V = 0x2c,
-    X4V4D24__COV8R8V = 0x2d,
-    V8D24__COV4R12V = 0x2e,
-    D32 = 0x2f,
-    D32S8 = 0x30,
-    X8D24_X20V4S8__COV4R4V = 0x31,
-    X8D24_X20V4S8__COV8R8V = 0x32,
-    D32_X20V4X8__COV4R4V = 0x33,
-    D32_X20V4X8__COV8R8V = 0x34,
-    D32_X20V4S8__COV4R4V = 0x35,
-    D32_X20V4S8__COV8R8V = 0x36,
-    X8D24_X16V8S8__COV4R12V = 0x37,
-    D32_X16V8X8__COV4R12V = 0x38,
-    D32_X16V8S8__COV4R12V = 0x39,
-    D16 = 0x3a,
-    V8D24__COV8R24V = 0x3b,
-    X8D24_X16V8S8__COV8R24V = 0x3c,
-    D32_X16V8X8__COV8R24V = 0x3d,
-    D32_X16V8S8__COV8R24V = 0x3e,
+    DXT1 = 0x24,
+    DXT23 = 0x25,
+    DXT45 = 0x26,
+    DXN1 = 0x27,
+    DXN2 = 0x28,
+    Z24S8 = 0x29,
+    X8Z24 = 0x2a,
+    S8Z24 = 0x2b,
+    X4V4Z24__COV4R4V = 0x2c,
+    X4V4Z24__COV8R8V = 0x2d,
+    V8Z24__COV4R12V = 0x2e,
+    Z32 = 0x2f,
+    Z32_X24S8 = 0x30,
+    X8Z24_X20V4S8__COV4R4V = 0x31,
+    X8Z24_X20V4S8__COV8R8V = 0x32,
+    Z32_X20V4X8__COV4R4V = 0x33,
+    Z32_X20V4X8__COV8R8V = 0x34,
+    Z32_X20V4S8__COV4R4V = 0x35,
+    Z32_X20V4S8__COV8R8V = 0x36,
+    X8Z24_X16V8S8__COV4R12V = 0x37,
+    Z32_X16V8X8__COV4R12V = 0x38,
+    Z32_X16V8S8__COV4R12V = 0x39,
+    Z16 = 0x3a,
+    V8Z24__COV8R24V = 0x3b,
+    X8Z24_X16V8S8__COV8R24V = 0x3c,
+    Z32_X16V8X8__COV8R24V = 0x3d,
+    Z32_X16V8S8__COV8R24V = 0x3e,
     ASTC_2D_4X4 = 0x40,
     ASTC_2D_5X5 = 0x41,
     ASTC_2D_6X6 = 0x42,
diff --git a/src/video_core/vulkan_common/vulkan_device.cpp b/src/video_core/vulkan_common/vulkan_device.cpp
index 48f1a3d14..6f288b3f8 100644
--- a/src/video_core/vulkan_common/vulkan_device.cpp
+++ b/src/video_core/vulkan_common/vulkan_device.cpp
@@ -401,6 +401,12 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR
             loaded_extensions.erase(VK_EXT_EXTENDED_DYNAMIC_STATE_2_EXTENSION_NAME);
         }
     }
+    if (extensions.extended_dynamic_state3 && is_radv) {
+        LOG_WARNING(Render_Vulkan, "RADV has broken extendedDynamicState3ColorBlendEquation");
+        features.extended_dynamic_state3.extendedDynamicState3ColorBlendEnable = false;
+        features.extended_dynamic_state3.extendedDynamicState3ColorBlendEquation = false;
+        dynamic_state3_blending = false;
+    }
     if (extensions.vertex_input_dynamic_state && is_radv) {
         // TODO(ameerj): Blacklist only offending driver versions
         // TODO(ameerj): Confirm if RDNA1 is affected
@@ -417,7 +423,7 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR
 
     sets_per_pool = 64;
     if (is_amd_driver) {
-        // AMD drivers need a higher amount of Sets per Pool in certain circunstances like in XC2.
+        // AMD drivers need a higher amount of Sets per Pool in certain circumstances like in XC2.
         sets_per_pool = 96;
         // Disable VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT on AMD GCN4 and lower as it is broken.
         if (!features.shader_float16_int8.shaderFloat16) {
diff --git a/src/video_core/vulkan_common/vulkan_device.h b/src/video_core/vulkan_common/vulkan_device.h
index 0662a2d9f..41b5da18a 100644
--- a/src/video_core/vulkan_common/vulkan_device.h
+++ b/src/video_core/vulkan_common/vulkan_device.h
@@ -180,7 +180,7 @@ public:
     ~Device();
 
     /**
-     * Returns a format supported by the device for the passed requeriments.
+     * Returns a format supported by the device for the passed requirements.
      * @param wanted_format The ideal format to be returned. It may not be the returned format.
     * @param wanted_usage The usage that must be fulfilled even if the format is not supported.
     * @param format_type Format type usage.
@@ -259,12 +259,12 @@ public:
 
     bool ShouldBoostClocks() const;
 
-    /// Returns uniform buffer alignment requeriment.
+    /// Returns uniform buffer alignment requirement.
     VkDeviceSize GetUniformBufferAlignment() const {
         return properties.properties.limits.minUniformBufferOffsetAlignment;
     }
 
-    /// Returns storage alignment requeriment.
+    /// Returns storage alignment requirement.
     VkDeviceSize GetStorageBufferAlignment() const {
         return properties.properties.limits.minStorageBufferOffsetAlignment;
     }
@@ -656,7 +656,7 @@ private:
     bool is_integrated{};                 ///< Is GPU an iGPU.
     bool is_virtual{};                    ///< Is GPU a virtual GPU.
     bool is_non_gpu{};                    ///< Is SoftwareRasterizer, FPGA, non-GPU device.
-    bool has_broken_cube_compatibility{}; ///< Has broken cube compatiblity bit
+    bool has_broken_cube_compatibility{}; ///< Has broken cube compatibility bit
     bool has_renderdoc{};                 ///< Has RenderDoc attached
     bool has_nsight_graphics{};           ///< Has Nsight Graphics attached
     bool supports_d24_depth{};            ///< Supports D24 depth buffers.
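
The RADV change follows the driver-quirk pattern used throughout Device: detect the driver once, log the reason, then clear the feature bits it advertises but implements incorrectly so the rest of the renderer never sees them. Schematically (types and names illustrative, not the Device API):

```cpp
#include <cstdio>

struct BlendFeatures {
    bool color_blend_enable = true;
    bool color_blend_equation = true;
};

// Mask out features a known-broken driver advertises anyway.
void ApplyDriverQuirks(BlendFeatures& features, bool is_radv) {
    if (is_radv) {
        std::puts("warning: disabling broken dynamic blend state on RADV");
        features.color_blend_enable = false;
        features.color_blend_equation = false;
    }
}
```
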
diff --git a/src/video_core/vulkan_common/vulkan_wrapper.h b/src/video_core/vulkan_common/vulkan_wrapper.h
index e86f661cb..4ff328a21 100644
--- a/src/video_core/vulkan_common/vulkan_wrapper.h
+++ b/src/video_core/vulkan_common/vulkan_wrapper.h
@@ -68,7 +68,7 @@ public:
     constexpr Span(const Range& range) : ptr{std::data(range)}, num{std::size(range)} {}
 
     /// Construct a span from a pointer and a size.
-    /// This is inteded for subranges.
+    /// This is intended for subranges.
    constexpr Span(const T* ptr_, std::size_t num_) noexcept : ptr{ptr_}, num{num_} {}
 
     /// Returns the data pointer by the span.
@@ -390,11 +390,11 @@ public:
     Handle(const Handle&) = delete;
     Handle& operator=(const Handle&) = delete;
 
-    /// Construct a handle transfering the ownership from another handle.
+    /// Construct a handle transferring the ownership from another handle.
     Handle(Handle&& rhs) noexcept
         : handle{std::exchange(rhs.handle, nullptr)}, owner{rhs.owner}, dld{rhs.dld} {}
 
-    /// Assign the current handle transfering the ownership from another handle.
+    /// Assign the current handle transferring the ownership from another handle.
     /// Destroys any previously held object.
     Handle& operator=(Handle&& rhs) noexcept {
         Release();
@@ -463,10 +463,10 @@ public:
     Handle(const Handle&) = delete;
     Handle& operator=(const Handle&) = delete;
 
-    /// Construct a handle transfering ownership from another handle.
+    /// Construct a handle transferring ownership from another handle.
     Handle(Handle&& rhs) noexcept : handle{std::exchange(rhs.handle, nullptr)}, dld{rhs.dld} {}
 
-    /// Assign the current handle transfering the ownership from another handle.
+    /// Assign the current handle transferring the ownership from another handle.
     /// Destroys any previously held object.
     Handle& operator=(Handle&& rhs) noexcept {
         Release();
@@ -533,12 +533,12 @@ public:
     PoolAllocations(const PoolAllocations&) = delete;
     PoolAllocations& operator=(const PoolAllocations&) = delete;
 
-    /// Construct an allocation transfering ownership from another allocation.
+    /// Construct an allocation transferring ownership from another allocation.
     PoolAllocations(PoolAllocations&& rhs) noexcept
         : allocations{std::move(rhs.allocations)}, num{rhs.num}, device{rhs.device}, pool{rhs.pool},
           dld{rhs.dld} {}
 
-    /// Assign an allocation transfering ownership from another allocation.
+    /// Assign an allocation transferring ownership from another allocation.
     PoolAllocations& operator=(PoolAllocations&& rhs) noexcept {
         allocations = std::move(rhs.allocations);
         num = rhs.num;
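
The corrected comments describe the wrapper's move-only ownership model: moving a Handle transfers the raw object and nulls the source via std::exchange, and move-assignment releases any previously held object first. A condensed sketch of that shape:

```cpp
#include <utility>

// Minimal move-only handle in the style of the wrapper's Handle<T>:
// moving transfers ownership and leaves the source null.
class Handle {
public:
    Handle() = default;
    explicit Handle(void* raw) : handle{raw} {}
    Handle(const Handle&) = delete;
    Handle& operator=(const Handle&) = delete;

    Handle(Handle&& rhs) noexcept : handle{std::exchange(rhs.handle, nullptr)} {}

    Handle& operator=(Handle&& rhs) noexcept {
        Release(); // destroy any previously held object first
        handle = std::exchange(rhs.handle, nullptr);
        return *this;
    }

    ~Handle() {
        Release();
    }

private:
    void Release() { /* destroy `handle` with the owning device here */ }
    void* handle = nullptr;
};
```
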