Diffstat (limited to 'src/video_core')
-rw-r--r--  src/video_core/buffer_cache/buffer_cache.h              |  77
-rw-r--r--  src/video_core/control/channel_state_cache.h            |   2
-rw-r--r--  src/video_core/engines/draw_manager.cpp                 |   1
-rw-r--r--  src/video_core/engines/maxwell_dma.cpp                  | 107
-rw-r--r--  src/video_core/engines/maxwell_dma.h                    |  88
-rw-r--r--  src/video_core/engines/sw_blitter/blitter.cpp           |   2
-rw-r--r--  src/video_core/framebuffer_config.h                     |   4
-rw-r--r--  src/video_core/gpu.cpp                                  |   2
-rw-r--r--  src/video_core/gpu_thread.cpp                           |   2
-rw-r--r--  src/video_core/host_shaders/astc_decoder.comp           |   2
-rw-r--r--  src/video_core/host_shaders/opengl_smaa.glsl            |   2
-rw-r--r--  src/video_core/memory_manager.h                         |   4
-rw-r--r--  src/video_core/query_cache.h                            |   2
-rw-r--r--  src/video_core/renderer_null/null_rasterizer.h          |   8
-rw-r--r--  src/video_core/renderer_opengl/blit_image.cpp           |   3
-rw-r--r--  src/video_core/renderer_opengl/gl_fence_manager.cpp     |   4
-rw-r--r--  src/video_core/renderer_opengl/gl_graphics_pipeline.cpp |   5
-rw-r--r--  src/video_core/renderer_opengl/gl_rasterizer.cpp        |  48
-rw-r--r--  src/video_core/renderer_opengl/gl_rasterizer.h          |  18
-rw-r--r--  src/video_core/renderer_opengl/gl_resource_manager.cpp  |  10
-rw-r--r--  src/video_core/renderer_opengl/gl_resource_manager.h    |   3
-rw-r--r--  src/video_core/renderer_opengl/gl_texture_cache.cpp     |  41
-rw-r--r--  src/video_core/renderer_opengl/gl_texture_cache.h       |   6
-rw-r--r--  src/video_core/renderer_vulkan/fixed_pipeline_state.cpp |   6
-rw-r--r--  src/video_core/renderer_vulkan/maxwell_to_vk.cpp        |   7
-rw-r--r--  src/video_core/renderer_vulkan/vk_buffer_cache.cpp      |   4
-rw-r--r--  src/video_core/renderer_vulkan/vk_command_pool.cpp      |   4
-rw-r--r--  src/video_core/renderer_vulkan/vk_rasterizer.cpp        |  50
-rw-r--r--  src/video_core/renderer_vulkan/vk_rasterizer.h          |  16
-rw-r--r--  src/video_core/renderer_vulkan/vk_resource_pool.cpp     |   4
-rw-r--r--  src/video_core/renderer_vulkan/vk_scheduler.cpp         |  84
-rw-r--r--  src/video_core/renderer_vulkan/vk_scheduler.h           |   6
-rw-r--r--  src/video_core/renderer_vulkan/vk_swapchain.cpp         |   2
-rw-r--r--  src/video_core/renderer_vulkan/vk_texture_cache.cpp     |  47
-rw-r--r--  src/video_core/renderer_vulkan/vk_texture_cache.h       |   6
-rw-r--r--  src/video_core/renderer_vulkan/vk_update_descriptor.cpp |   2
-rw-r--r--  src/video_core/texture_cache/format_lookup_table.cpp    |  62
-rw-r--r--  src/video_core/texture_cache/image_base.h               |   2
-rw-r--r--  src/video_core/texture_cache/image_info.cpp             |  45
-rw-r--r--  src/video_core/texture_cache/image_info.h               |   2
-rw-r--r--  src/video_core/texture_cache/texture_cache.h            | 119
-rw-r--r--  src/video_core/texture_cache/texture_cache_base.h       |  11
-rw-r--r--  src/video_core/texture_cache/types.h                    |   1
-rw-r--r--  src/video_core/texture_cache/util.cpp                   |  98
-rw-r--r--  src/video_core/texture_cache/util.h                     |  10
-rw-r--r--  src/video_core/textures/astc.cpp                        |   2
-rw-r--r--  src/video_core/textures/texture.h                       |  70
-rw-r--r--  src/video_core/vulkan_common/vulkan_device.cpp          |   8
-rw-r--r--  src/video_core/vulkan_common/vulkan_device.h            |   8
-rw-r--r--  src/video_core/vulkan_common/vulkan_wrapper.h           |  14
50 files changed, 852 insertions(+), 279 deletions(-)
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index 06fd40851..1f656ffa8 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -55,6 +55,19 @@ constexpr u32 NUM_STORAGE_BUFFERS = 16;
constexpr u32 NUM_TEXTURE_BUFFERS = 16;
constexpr u32 NUM_STAGES = 5;
+enum class ObtainBufferSynchronize : u32 {
+ NoSynchronize = 0,
+ FullSynchronize = 1,
+ SynchronizeNoDirty = 2,
+};
+
+enum class ObtainBufferOperation : u32 {
+ DoNothing = 0,
+ MarkAsWritten = 1,
+ DiscardWrite = 2,
+ MarkQuery = 3,
+};
+
using UniformBufferSizes = std::array<std::array<u32, NUM_GRAPHICS_UNIFORM_BUFFERS>, NUM_STAGES>;
using ComputeUniformBufferSizes = std::array<u32, NUM_COMPUTE_UNIFORM_BUFFERS>;
@@ -191,6 +204,10 @@ public:
bool DMAClear(GPUVAddr src_address, u64 amount, u32 value);
+ [[nodiscard]] std::pair<Buffer*, u32> ObtainBuffer(GPUVAddr gpu_addr, u32 size,
+ ObtainBufferSynchronize sync_info,
+ ObtainBufferOperation post_op);
+
/// Return true when a CPU region is modified from the GPU
[[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size);
@@ -366,7 +383,8 @@ private:
void NotifyBufferDeletion();
- [[nodiscard]] Binding StorageBufferBinding(GPUVAddr ssbo_addr, bool is_written = false) const;
+ [[nodiscard]] Binding StorageBufferBinding(GPUVAddr ssbo_addr, u32 cbuf_index,
+ bool is_written = false) const;
[[nodiscard]] TextureBufferBinding GetTextureBufferBinding(GPUVAddr gpu_addr, u32 size,
PixelFormat format);
@@ -642,6 +660,42 @@ bool BufferCache<P>::DMAClear(GPUVAddr dst_address, u64 amount, u32 value) {
}
template <class P>
+std::pair<typename P::Buffer*, u32> BufferCache<P>::ObtainBuffer(GPUVAddr gpu_addr, u32 size,
+ ObtainBufferSynchronize sync_info,
+ ObtainBufferOperation post_op) {
+ const std::optional<VAddr> cpu_addr = gpu_memory->GpuToCpuAddress(gpu_addr);
+ if (!cpu_addr) {
+ return {&slot_buffers[NULL_BUFFER_ID], 0};
+ }
+ const BufferId buffer_id = FindBuffer(*cpu_addr, size);
+ Buffer& buffer = slot_buffers[buffer_id];
+
+ // synchronize op
+ switch (sync_info) {
+ case ObtainBufferSynchronize::FullSynchronize:
+ SynchronizeBuffer(buffer, *cpu_addr, size);
+ break;
+ default:
+ break;
+ }
+
+ switch (post_op) {
+ case ObtainBufferOperation::MarkAsWritten:
+ MarkWrittenBuffer(buffer_id, *cpu_addr, size);
+ break;
+ case ObtainBufferOperation::DiscardWrite: {
+ IntervalType interval{*cpu_addr, size};
+ ClearDownload(interval);
+ break;
+ }
+ default:
+ break;
+ }
+
+ return {&buffer, buffer.Offset(*cpu_addr)};
+}
+
+template <class P>
void BufferCache<P>::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr,
u32 size) {
const std::optional<VAddr> cpu_addr = gpu_memory->GpuToCpuAddress(gpu_addr);
@@ -749,7 +803,7 @@ void BufferCache<P>::BindGraphicsStorageBuffer(size_t stage, size_t ssbo_index,
const auto& cbufs = maxwell3d->state.shader_stages[stage];
const GPUVAddr ssbo_addr = cbufs.const_buffers[cbuf_index].address + cbuf_offset;
- storage_buffers[stage][ssbo_index] = StorageBufferBinding(ssbo_addr, is_written);
+ storage_buffers[stage][ssbo_index] = StorageBufferBinding(ssbo_addr, cbuf_index, is_written);
}
template <class P>
@@ -789,7 +843,7 @@ void BufferCache<P>::BindComputeStorageBuffer(size_t ssbo_index, u32 cbuf_index,
const auto& cbufs = launch_desc.const_buffer_config;
const GPUVAddr ssbo_addr = cbufs[cbuf_index].Address() + cbuf_offset;
- compute_storage_buffers[ssbo_index] = StorageBufferBinding(ssbo_addr, is_written);
+ compute_storage_buffers[ssbo_index] = StorageBufferBinding(ssbo_addr, cbuf_index, is_written);
}
template <class P>
@@ -1935,11 +1989,26 @@ void BufferCache<P>::NotifyBufferDeletion() {
template <class P>
typename BufferCache<P>::Binding BufferCache<P>::StorageBufferBinding(GPUVAddr ssbo_addr,
+ u32 cbuf_index,
bool is_written) const {
const GPUVAddr gpu_addr = gpu_memory->Read<u64>(ssbo_addr);
- const u32 size = gpu_memory->Read<u32>(ssbo_addr + 8);
+ const auto size = [&]() {
+ const bool is_nvn_cbuf = cbuf_index == 0;
+ // The NVN driver buffer (index 0) is known to pack the SSBO address followed by its size.
+ if (is_nvn_cbuf) {
+ return gpu_memory->Read<u32>(ssbo_addr + 8);
+ }
+ // Other titles (notably Doom Eternal) may use STG/LDG on buffer addresses in custom defined
+ // cbufs, which do not store the sizes adjacent to the addresses, so use the fully
+ // mapped buffer size for now.
+ const u32 memory_layout_size = static_cast<u32>(gpu_memory->GetMemoryLayoutSize(gpu_addr));
+ LOG_INFO(HW_GPU, "Binding storage buffer for cbuf index {}, MemoryLayoutSize 0x{:X}",
+ cbuf_index, memory_layout_size);
+ return memory_layout_size;
+ }();
const std::optional<VAddr> cpu_addr = gpu_memory->GpuToCpuAddress(gpu_addr);
if (!cpu_addr || size == 0) {
+ LOG_WARNING(HW_GPU, "Failed to find storage buffer for cbuf index {}", cbuf_index);
return NULL_BINDING;
}
const VAddr cpu_end = Common::AlignUp(*cpu_addr + size, Core::Memory::YUZU_PAGESIZE);
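A minimal usage sketch of the new ObtainBuffer entry point (hypothetical call site; the enum values and the returned buffer/offset pair are the ones defined above, and this mirrors how the DMA paths later in this diff consume the API):

    // Fetch the host buffer backing a GPU range, fully synchronized, and mark
    // it as GPU-written so later CPU reads will flush it back.
    const auto [buffer, offset] = buffer_cache.ObtainBuffer(
        gpu_addr, size, VideoCommon::ObtainBufferSynchronize::FullSynchronize,
        VideoCommon::ObtainBufferOperation::MarkAsWritten);
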
diff --git a/src/video_core/control/channel_state_cache.h b/src/video_core/control/channel_state_cache.h
index cdaf4f8d5..46bc9e322 100644
--- a/src/video_core/control/channel_state_cache.h
+++ b/src/video_core/control/channel_state_cache.h
@@ -44,7 +44,7 @@ public:
template <class P>
class ChannelSetupCaches {
public:
- /// Operations for seting the channel of execution.
+ /// Operations for setting the channel of execution.
virtual ~ChannelSetupCaches();
/// Create channel state.
diff --git a/src/video_core/engines/draw_manager.cpp b/src/video_core/engines/draw_manager.cpp
index 1d22d25f1..0e94c521a 100644
--- a/src/video_core/engines/draw_manager.cpp
+++ b/src/video_core/engines/draw_manager.cpp
@@ -164,6 +164,7 @@ void DrawManager::DrawEnd(u32 instance_count, bool force_draw) {
draw_state.index_buffer.count =
static_cast<u32>(draw_state.inline_index_draw_indexes.size() / 4);
draw_state.index_buffer.format = Maxwell3D::Regs::IndexFormat::UnsignedInt;
+ maxwell3d->dirty.flags[VideoCommon::Dirty::IndexBuffer] = true;
ProcessDraw(true, instance_count);
draw_state.inline_index_draw_indexes.clear();
break;
diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp
index 7762c7d96..e68850dc5 100644
--- a/src/video_core/engines/maxwell_dma.cpp
+++ b/src/video_core/engines/maxwell_dma.cpp
@@ -14,7 +14,13 @@
#include "video_core/textures/decoders.h"
MICROPROFILE_DECLARE(GPU_DMAEngine);
+MICROPROFILE_DECLARE(GPU_DMAEngineBL);
+MICROPROFILE_DECLARE(GPU_DMAEngineLB);
+MICROPROFILE_DECLARE(GPU_DMAEngineBB);
MICROPROFILE_DEFINE(GPU_DMAEngine, "GPU", "DMA Engine", MP_RGB(224, 224, 128));
+MICROPROFILE_DEFINE(GPU_DMAEngineBL, "GPU", "DMA Engine Block - Linear", MP_RGB(224, 224, 128));
+MICROPROFILE_DEFINE(GPU_DMAEngineLB, "GPU", "DMA Engine Linear - Block", MP_RGB(224, 224, 128));
+MICROPROFILE_DEFINE(GPU_DMAEngineBB, "GPU", "DMA Engine Block - Block", MP_RGB(224, 224, 128));
namespace Tegra::Engines {
@@ -72,6 +78,7 @@ void MaxwellDMA::Launch() {
memory_manager.FlushCaching();
if (!is_src_pitch && !is_dst_pitch) {
         // Both the source and the destination are in block layout; use the block-to-block copy.
+ MICROPROFILE_SCOPE(GPU_DMAEngineBB);
CopyBlockLinearToBlockLinear();
ReleaseSemaphore();
return;
@@ -87,8 +94,10 @@ void MaxwellDMA::Launch() {
}
} else {
if (!is_src_pitch && is_dst_pitch) {
+ MICROPROFILE_SCOPE(GPU_DMAEngineBL);
CopyBlockLinearToPitch();
} else {
+ MICROPROFILE_SCOPE(GPU_DMAEngineLB);
CopyPitchToBlockLinear();
}
}
@@ -153,21 +162,35 @@ void MaxwellDMA::Launch() {
}
void MaxwellDMA::CopyBlockLinearToPitch() {
- UNIMPLEMENTED_IF(regs.src_params.block_size.width != 0);
- UNIMPLEMENTED_IF(regs.src_params.layer != 0);
-
- const bool is_remapping = regs.launch_dma.remap_enable != 0;
-
- // Optimized path for micro copies.
- const size_t dst_size = static_cast<size_t>(regs.pitch_out) * regs.line_count;
- if (!is_remapping && dst_size < GOB_SIZE && regs.pitch_out <= GOB_SIZE_X &&
- regs.src_params.height > GOB_SIZE_Y) {
- FastCopyBlockLinearToPitch();
+ UNIMPLEMENTED_IF(regs.launch_dma.remap_enable != 0);
+
+ u32 bytes_per_pixel = 1;
+ DMA::ImageOperand src_operand;
+ src_operand.bytes_per_pixel = bytes_per_pixel;
+ src_operand.params = regs.src_params;
+ src_operand.address = regs.offset_in;
+
+ DMA::BufferOperand dst_operand;
+ dst_operand.pitch = regs.pitch_out;
+ dst_operand.width = regs.line_length_in;
+ dst_operand.height = regs.line_count;
+ dst_operand.address = regs.offset_out;
+ DMA::ImageCopy copy_info{};
+ copy_info.length_x = regs.line_length_in;
+ copy_info.length_y = regs.line_count;
+ auto& accelerate = rasterizer->AccessAccelerateDMA();
+ if (accelerate.ImageToBuffer(copy_info, src_operand, dst_operand)) {
return;
}
+ UNIMPLEMENTED_IF(regs.src_params.block_size.width != 0);
+ UNIMPLEMENTED_IF(regs.src_params.block_size.depth != 0);
+ UNIMPLEMENTED_IF(regs.src_params.block_size.depth == 0 && regs.src_params.depth != 1);
+
// Deswizzle the input and copy it over.
- const Parameters& src_params = regs.src_params;
+ const DMA::Parameters& src_params = regs.src_params;
+
+ const bool is_remapping = regs.launch_dma.remap_enable != 0;
const u32 num_remap_components = regs.remap_const.num_dst_components_minus_one + 1;
const u32 remap_components_size = regs.remap_const.component_size_minus_one + 1;
@@ -187,7 +210,7 @@ void MaxwellDMA::CopyBlockLinearToPitch() {
x_offset >>= bpp_shift;
}
- const u32 bytes_per_pixel = base_bpp << bpp_shift;
+ bytes_per_pixel = base_bpp << bpp_shift;
const u32 height = src_params.height;
const u32 depth = src_params.depth;
const u32 block_height = src_params.block_size.height;
@@ -195,11 +218,12 @@ void MaxwellDMA::CopyBlockLinearToPitch() {
const size_t src_size =
CalculateSize(true, bytes_per_pixel, width, height, depth, block_height, block_depth);
+ const size_t dst_size = static_cast<size_t>(regs.pitch_out) * regs.line_count;
read_buffer.resize_destructive(src_size);
write_buffer.resize_destructive(dst_size);
- memory_manager.ReadBlock(regs.offset_in, read_buffer.data(), src_size);
- memory_manager.ReadBlock(regs.offset_out, write_buffer.data(), dst_size);
+ memory_manager.ReadBlock(src_operand.address, read_buffer.data(), src_size);
+ memory_manager.ReadBlockUnsafe(dst_operand.address, write_buffer.data(), dst_size);
UnswizzleSubrect(write_buffer, read_buffer, bytes_per_pixel, width, height, depth, x_offset,
src_params.origin.y, x_elements, regs.line_count, block_height, block_depth,
@@ -216,6 +240,24 @@ void MaxwellDMA::CopyPitchToBlockLinear() {
const u32 num_remap_components = regs.remap_const.num_dst_components_minus_one + 1;
const u32 remap_components_size = regs.remap_const.component_size_minus_one + 1;
+ u32 bytes_per_pixel = 1;
+ DMA::ImageOperand dst_operand;
+ dst_operand.bytes_per_pixel = bytes_per_pixel;
+ dst_operand.params = regs.dst_params;
+ dst_operand.address = regs.offset_out;
+ DMA::BufferOperand src_operand;
+ src_operand.pitch = regs.pitch_in;
+ src_operand.width = regs.line_length_in;
+ src_operand.height = regs.line_count;
+ src_operand.address = regs.offset_in;
+ DMA::ImageCopy copy_info{};
+ copy_info.length_x = regs.line_length_in;
+ copy_info.length_y = regs.line_count;
+ auto& accelerate = rasterizer->AccessAccelerateDMA();
+ if (accelerate.BufferToImage(copy_info, src_operand, dst_operand)) {
+ return;
+ }
+
const auto& dst_params = regs.dst_params;
const u32 base_bpp = !is_remapping ? 1U : num_remap_components * remap_components_size;
@@ -233,7 +275,7 @@ void MaxwellDMA::CopyPitchToBlockLinear() {
x_offset >>= bpp_shift;
}
- const u32 bytes_per_pixel = base_bpp << bpp_shift;
+ bytes_per_pixel = base_bpp << bpp_shift;
const u32 height = dst_params.height;
const u32 depth = dst_params.depth;
const u32 block_height = dst_params.block_size.height;
@@ -260,45 +302,14 @@ void MaxwellDMA::CopyPitchToBlockLinear() {
memory_manager.WriteBlockCached(regs.offset_out, write_buffer.data(), dst_size);
}
-void MaxwellDMA::FastCopyBlockLinearToPitch() {
- const u32 bytes_per_pixel = 1U;
- const size_t src_size = GOB_SIZE;
- const size_t dst_size = static_cast<size_t>(regs.pitch_out) * regs.line_count;
- u32 pos_x = regs.src_params.origin.x;
- u32 pos_y = regs.src_params.origin.y;
- const u64 offset = GetGOBOffset(regs.src_params.width, regs.src_params.height, pos_x, pos_y,
- regs.src_params.block_size.height, bytes_per_pixel);
- const u32 x_in_gob = 64 / bytes_per_pixel;
- pos_x = pos_x % x_in_gob;
- pos_y = pos_y % 8;
-
- read_buffer.resize_destructive(src_size);
- write_buffer.resize_destructive(dst_size);
-
- if (Settings::IsGPULevelExtreme()) {
- memory_manager.ReadBlock(regs.offset_in + offset, read_buffer.data(), src_size);
- memory_manager.ReadBlock(regs.offset_out, write_buffer.data(), dst_size);
- } else {
- memory_manager.ReadBlockUnsafe(regs.offset_in + offset, read_buffer.data(), src_size);
- memory_manager.ReadBlockUnsafe(regs.offset_out, write_buffer.data(), dst_size);
- }
-
- UnswizzleSubrect(write_buffer, read_buffer, bytes_per_pixel, regs.src_params.width,
- regs.src_params.height, 1, pos_x, pos_y, regs.line_length_in, regs.line_count,
- regs.src_params.block_size.height, regs.src_params.block_size.depth,
- regs.pitch_out);
-
- memory_manager.WriteBlockCached(regs.offset_out, write_buffer.data(), dst_size);
-}
-
void MaxwellDMA::CopyBlockLinearToBlockLinear() {
UNIMPLEMENTED_IF(regs.src_params.block_size.width != 0);
const bool is_remapping = regs.launch_dma.remap_enable != 0;
// Deswizzle the input and copy it over.
- const Parameters& src = regs.src_params;
- const Parameters& dst = regs.dst_params;
+ const DMA::Parameters& src = regs.src_params;
+ const DMA::Parameters& dst = regs.dst_params;
const u32 num_remap_components = regs.remap_const.num_dst_components_minus_one + 1;
const u32 remap_components_size = regs.remap_const.component_size_minus_one + 1;
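Backends that cannot service a given copy simply decline it, and the engine falls back to the software deswizzle paths above. A minimal sketch of an implementation that always declines (mirroring the null rasterizer later in this diff):

    struct NoAccelerateDMA final : Tegra::Engines::AccelerateDMAInterface {
        bool BufferCopy(GPUVAddr, GPUVAddr, u64) override { return false; }
        bool BufferClear(GPUVAddr, u64, u32) override { return false; }
        bool ImageToBuffer(const Tegra::DMA::ImageCopy&, const Tegra::DMA::ImageOperand&,
                           const Tegra::DMA::BufferOperand&) override { return false; }
        bool BufferToImage(const Tegra::DMA::ImageCopy&, const Tegra::DMA::BufferOperand&,
                           const Tegra::DMA::ImageOperand&) override { return false; }
    };
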
diff --git a/src/video_core/engines/maxwell_dma.h b/src/video_core/engines/maxwell_dma.h
index 0e594fa74..69e26cb32 100644
--- a/src/video_core/engines/maxwell_dma.h
+++ b/src/video_core/engines/maxwell_dma.h
@@ -24,6 +24,54 @@ namespace VideoCore {
class RasterizerInterface;
}
+namespace Tegra {
+namespace DMA {
+
+union Origin {
+ BitField<0, 16, u32> x;
+ BitField<16, 16, u32> y;
+};
+static_assert(sizeof(Origin) == 4);
+
+struct ImageCopy {
+ u32 length_x{};
+ u32 length_y{};
+};
+
+union BlockSize {
+ BitField<0, 4, u32> width;
+ BitField<4, 4, u32> height;
+ BitField<8, 4, u32> depth;
+ BitField<12, 4, u32> gob_height;
+};
+static_assert(sizeof(BlockSize) == 4);
+
+struct Parameters {
+ BlockSize block_size;
+ u32 width;
+ u32 height;
+ u32 depth;
+ u32 layer;
+ Origin origin;
+};
+static_assert(sizeof(Parameters) == 24);
+
+struct ImageOperand {
+ u32 bytes_per_pixel;
+ Parameters params;
+ GPUVAddr address;
+};
+
+struct BufferOperand {
+ u32 pitch;
+ u32 width;
+ u32 height;
+ GPUVAddr address;
+};
+
+} // namespace DMA
+} // namespace Tegra
+
namespace Tegra::Engines {
class AccelerateDMAInterface {
@@ -32,6 +80,12 @@ public:
virtual bool BufferCopy(GPUVAddr src_address, GPUVAddr dest_address, u64 amount) = 0;
virtual bool BufferClear(GPUVAddr src_address, u64 amount, u32 value) = 0;
+
+ virtual bool ImageToBuffer(const DMA::ImageCopy& copy_info, const DMA::ImageOperand& src,
+ const DMA::BufferOperand& dst) = 0;
+
+ virtual bool BufferToImage(const DMA::ImageCopy& copy_info, const DMA::BufferOperand& src,
+ const DMA::ImageOperand& dst) = 0;
};
/**
@@ -51,30 +105,6 @@ public:
}
};
- union BlockSize {
- BitField<0, 4, u32> width;
- BitField<4, 4, u32> height;
- BitField<8, 4, u32> depth;
- BitField<12, 4, u32> gob_height;
- };
- static_assert(sizeof(BlockSize) == 4);
-
- union Origin {
- BitField<0, 16, u32> x;
- BitField<16, 16, u32> y;
- };
- static_assert(sizeof(Origin) == 4);
-
- struct Parameters {
- BlockSize block_size;
- u32 width;
- u32 height;
- u32 depth;
- u32 layer;
- Origin origin;
- };
- static_assert(sizeof(Parameters) == 24);
-
struct Semaphore {
PackedGPUVAddr address;
u32 payload;
@@ -227,8 +257,6 @@ private:
void CopyBlockLinearToBlockLinear();
- void FastCopyBlockLinearToPitch();
-
void ReleaseSemaphore();
void ConsumeSinkImpl() override;
@@ -261,17 +289,17 @@ private:
u32 reserved05[0x3f];
PackedGPUVAddr offset_in;
PackedGPUVAddr offset_out;
- u32 pitch_in;
- u32 pitch_out;
+ s32 pitch_in;
+ s32 pitch_out;
u32 line_length_in;
u32 line_count;
u32 reserved06[0xb6];
u32 remap_consta_value;
u32 remap_constb_value;
RemapConst remap_const;
- Parameters dst_params;
+ DMA::Parameters dst_params;
u32 reserved07[0x1];
- Parameters src_params;
+ DMA::Parameters src_params;
u32 reserved08[0x275];
u32 pm_trigger_end;
u32 reserved09[0x3ba];
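The nibble-packed BlockSize fields are consumed as log2 GOB counts by the deswizzle helpers, so a decoded dimension is 1 << field GOBs. An illustrative decode (the raw value is made up, and since the union declares no named raw member, a memcpy stands in for register assignment):

    u32 raw = 0x0010;
    Tegra::DMA::BlockSize block_size{};
    std::memcpy(&block_size, &raw, sizeof(block_size));
    // width = 0 -> 1 GOB, height = 1 -> 2 GOBs, depth = 0 -> 1 GOB
    const u32 block_height_gobs = 1U << block_size.height; // == 2
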
diff --git a/src/video_core/engines/sw_blitter/blitter.cpp b/src/video_core/engines/sw_blitter/blitter.cpp
index 2f1ea4626..3c9f38559 100644
--- a/src/video_core/engines/sw_blitter/blitter.cpp
+++ b/src/video_core/engines/sw_blitter/blitter.cpp
@@ -193,7 +193,7 @@ bool SoftwareBlitEngine::Blit(Fermi2D::Surface& src, Fermi2D::Surface& dst,
output_converter->ConvertFrom(impl->intermediate_dst, impl->dst_buffer);
};
- // Do actuall Blit
+ // Do actual Blit
impl->dst_buffer.resize(dst_copy_size);
if (src.linear == Fermi2D::MemoryLayout::BlockLinear) {
diff --git a/src/video_core/framebuffer_config.h b/src/video_core/framebuffer_config.h
index d93f5a37f..5f3bffcab 100644
--- a/src/video_core/framebuffer_config.h
+++ b/src/video_core/framebuffer_config.h
@@ -5,8 +5,8 @@
#include "common/common_types.h"
#include "common/math_util.h"
-#include "core/hle/service/nvflinger/buffer_transform_flags.h"
-#include "core/hle/service/nvflinger/pixel_format.h"
+#include "core/hle/service/nvnflinger/buffer_transform_flags.h"
+#include "core/hle/service/nvnflinger/pixel_format.h"
namespace Tegra {
diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp
index 7024a19cf..2e7f9c5ed 100644
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -197,7 +197,7 @@ struct GPU::Impl {
constexpr u64 gpu_ticks_num = 384;
constexpr u64 gpu_ticks_den = 625;
- u64 nanoseconds = system.CoreTiming().GetGlobalTimeNs().count();
+ u64 nanoseconds = system.CoreTiming().GetCPUTimeNs().count();
if (Settings::values.use_fast_gpu_time.GetValue()) {
nanoseconds /= 256;
}
diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp
index 7cc5647e9..f52f9e28f 100644
--- a/src/video_core/gpu_thread.cpp
+++ b/src/video_core/gpu_thread.cpp
@@ -25,7 +25,7 @@ static void RunThread(std::stop_token stop_token, Core::System& system,
SCOPE_EXIT({ MicroProfileOnThreadExit(); });
Common::SetCurrentThreadName(name.c_str());
- Common::SetCurrentThreadPriority(Common::ThreadPriority::High);
+ Common::SetCurrentThreadPriority(Common::ThreadPriority::Critical);
system.RegisterHostThread();
auto current_context = context.Acquire();
diff --git a/src/video_core/host_shaders/astc_decoder.comp b/src/video_core/host_shaders/astc_decoder.comp
index d608678a3..bf2693559 100644
--- a/src/video_core/host_shaders/astc_decoder.comp
+++ b/src/video_core/host_shaders/astc_decoder.comp
@@ -125,7 +125,7 @@ uvec4 local_buff;
uvec4 color_endpoint_data;
int color_bitsread = 0;
-// Four values, two endpoints, four maximum paritions
+// Four values, two endpoints, four maximum partitions
uint color_values[32];
int colvals_index = 0;
diff --git a/src/video_core/host_shaders/opengl_smaa.glsl b/src/video_core/host_shaders/opengl_smaa.glsl
index 3cbe87bbf..419f89bca 100644
--- a/src/video_core/host_shaders/opengl_smaa.glsl
+++ b/src/video_core/host_shaders/opengl_smaa.glsl
@@ -97,7 +97,7 @@
* half-rate linear filtering on GCN.
*
* If SMAA is applied to 64-bit color buffers, switching to point filtering
- * when accesing them will increase the performance. Search for
+ * when accessing them will increase the performance. Search for
* 'SMAASamplePoint' to see which textures may benefit from point
* filtering, and where (which is basically the color input in the edge
* detection and resolve passes).
diff --git a/src/video_core/memory_manager.h b/src/video_core/memory_manager.h
index cf56392ef..51ae2de68 100644
--- a/src/video_core/memory_manager.h
+++ b/src/video_core/memory_manager.h
@@ -103,8 +103,8 @@ public:
/**
* Returns a vector with all the subranges of cpu addresses mapped beneath.
- * if the region is continous, a single pair will be returned. If it's unmapped, an empty vector
- * will be returned;
+ * If the region is continuous, a single pair will be returned. If it's unmapped, an empty
+ * vector will be returned.
*/
std::vector<std::pair<GPUVAddr, std::size_t>> GetSubmappedRange(GPUVAddr gpu_addr,
std::size_t size) const;
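A short sketch of consuming GetSubmappedRange (hypothetical caller, using the signature above):

    // Each pair is one contiguous mapped piece of the requested GPU range.
    for (const auto& [sub_addr, sub_size] : memory_manager.GetSubmappedRange(gpu_addr, size)) {
        // A fully continuous mapping yields exactly one pair; an unmapped
        // range yields an empty vector, so this body never executes.
    }
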
diff --git a/src/video_core/query_cache.h b/src/video_core/query_cache.h
index 00ce53e3e..8906ba6d8 100644
--- a/src/video_core/query_cache.h
+++ b/src/video_core/query_cache.h
@@ -341,7 +341,7 @@ public:
/// Flushes the query to guest memory.
virtual void Flush() {
- // When counter is nullptr it means that it's just been reseted. We are supposed to write a
+ // When counter is nullptr it means that it's just been reset. We are supposed to write a
// zero in these cases.
const u64 value = counter ? counter->Query() : 0;
std::memcpy(host_ptr, &value, sizeof(u64));
diff --git a/src/video_core/renderer_null/null_rasterizer.h b/src/video_core/renderer_null/null_rasterizer.h
index 51f896e43..0c59e6a1f 100644
--- a/src/video_core/renderer_null/null_rasterizer.h
+++ b/src/video_core/renderer_null/null_rasterizer.h
@@ -22,6 +22,14 @@ public:
explicit AccelerateDMA();
bool BufferCopy(GPUVAddr start_address, GPUVAddr end_address, u64 amount) override;
bool BufferClear(GPUVAddr src_address, u64 amount, u32 value) override;
+ bool ImageToBuffer(const Tegra::DMA::ImageCopy& copy_info, const Tegra::DMA::ImageOperand& src,
+ const Tegra::DMA::BufferOperand& dst) override {
+ return false;
+ }
+ bool BufferToImage(const Tegra::DMA::ImageCopy& copy_info, const Tegra::DMA::BufferOperand& src,
+ const Tegra::DMA::ImageOperand& dst) override {
+ return false;
+ }
};
class RasterizerNull final : public VideoCore::RasterizerAccelerated,
diff --git a/src/video_core/renderer_opengl/blit_image.cpp b/src/video_core/renderer_opengl/blit_image.cpp
index 9a560a73b..3b03e8d5a 100644
--- a/src/video_core/renderer_opengl/blit_image.cpp
+++ b/src/video_core/renderer_opengl/blit_image.cpp
@@ -22,7 +22,7 @@ BlitImageHelper::~BlitImageHelper() = default;
void BlitImageHelper::BlitColor(GLuint dst_framebuffer, GLuint src_image_view, GLuint src_sampler,
const Region2D& dst_region, const Region2D& src_region,
const Extent3D& src_size) {
- glEnable(GL_CULL_FACE);
+ glDisable(GL_CULL_FACE);
glDisable(GL_COLOR_LOGIC_OP);
glDisable(GL_DEPTH_TEST);
glDisable(GL_STENCIL_TEST);
@@ -31,7 +31,6 @@ void BlitImageHelper::BlitColor(GLuint dst_framebuffer, GLuint src_image_view, G
glDisable(GL_ALPHA_TEST);
glDisablei(GL_BLEND, 0);
glPolygonMode(GL_FRONT_AND_BACK, GL_FILL);
- glCullFace(GL_BACK);
glFrontFace(GL_CW);
glColorMaski(0, GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE);
glDepthRangeIndexed(0, 0.0, 0.0);
diff --git a/src/video_core/renderer_opengl/gl_fence_manager.cpp b/src/video_core/renderer_opengl/gl_fence_manager.cpp
index 91463f854..5326172af 100644
--- a/src/video_core/renderer_opengl/gl_fence_manager.cpp
+++ b/src/video_core/renderer_opengl/gl_fence_manager.cpp
@@ -27,9 +27,7 @@ bool GLInnerFence::IsSignaled() const {
return true;
}
ASSERT(sync_object.handle != 0);
- GLint sync_status;
- glGetSynciv(sync_object.handle, GL_SYNC_STATUS, 1, nullptr, &sync_status);
- return sync_status == GL_SIGNALED;
+ return sync_object.IsSignaled();
}
void GLInnerFence::Wait() {
diff --git a/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp b/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp
index 29491e762..89000d6e0 100644
--- a/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp
+++ b/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp
@@ -621,10 +621,7 @@ bool GraphicsPipeline::IsBuilt() noexcept {
if (built_fence.handle == 0) {
return false;
}
- // Timeout of zero means this is non-blocking
- const auto sync_status = glClientWaitSync(built_fence.handle, 0, 0);
- ASSERT(sync_status != GL_WAIT_FAILED);
- is_built = sync_status != GL_TIMEOUT_EXPIRED;
+ is_built = built_fence.IsSignaled();
return is_built;
}
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 7bced675c..90e35e307 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -63,7 +63,7 @@ RasterizerOpenGL::RasterizerOpenGL(Core::Frontend::EmuWindow& emu_window_, Tegra
buffer_cache(*this, cpu_memory_, buffer_cache_runtime),
shader_cache(*this, emu_window_, device, texture_cache, buffer_cache, program_manager,
state_tracker, gpu.ShaderNotify()),
- query_cache(*this), accelerate_dma(buffer_cache),
+ query_cache(*this), accelerate_dma(buffer_cache, texture_cache),
fence_manager(*this, gpu, texture_cache, buffer_cache, query_cache),
blit_image(program_manager_) {}
@@ -357,6 +357,7 @@ void RasterizerOpenGL::DrawTexture() {
.y = static_cast<s32>(draw_texture_state.src_y1)}};
blit_image.BlitColor(texture_cache.GetFramebuffer()->Handle(), texture.DefaultHandle(),
sampler->Handle(), dst_region, src_region, texture.size);
+ state_tracker.InvalidateState();
}
++num_queued_commands;
@@ -576,7 +577,7 @@ bool RasterizerOpenGL::AccelerateConditionalRendering() {
// Reimplement Host conditional rendering.
return false;
}
- // Medium / Low Hack: stub any checks on queries writen into the buffer cache.
+ // Medium / Low Hack: stub any checks on queries written into the buffer cache.
const GPUVAddr condition_address{maxwell3d->regs.render_enable.Address()};
Maxwell::ReportSemaphore::Compare cmp;
if (gpu_memory->IsMemoryDirty(condition_address, sizeof(cmp),
@@ -1262,7 +1263,8 @@ void RasterizerOpenGL::ReleaseChannel(s32 channel_id) {
query_cache.EraseChannel(channel_id);
}
-AccelerateDMA::AccelerateDMA(BufferCache& buffer_cache_) : buffer_cache{buffer_cache_} {}
+AccelerateDMA::AccelerateDMA(BufferCache& buffer_cache_, TextureCache& texture_cache_)
+ : buffer_cache{buffer_cache_}, texture_cache{texture_cache_} {}
bool AccelerateDMA::BufferCopy(GPUVAddr src_address, GPUVAddr dest_address, u64 amount) {
std::scoped_lock lock{buffer_cache.mutex};
@@ -1274,4 +1276,44 @@ bool AccelerateDMA::BufferClear(GPUVAddr src_address, u64 amount, u32 value) {
return buffer_cache.DMAClear(src_address, amount, value);
}
+template <bool IS_IMAGE_UPLOAD>
+bool AccelerateDMA::DmaBufferImageCopy(const Tegra::DMA::ImageCopy& copy_info,
+ const Tegra::DMA::BufferOperand& buffer_operand,
+ const Tegra::DMA::ImageOperand& image_operand) {
+ std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex};
+ const auto image_id = texture_cache.DmaImageId(image_operand);
+ if (image_id == VideoCommon::NULL_IMAGE_ID) {
+ return false;
+ }
+ const u32 buffer_size = static_cast<u32>(buffer_operand.pitch * buffer_operand.height);
+ static constexpr auto sync_info = VideoCommon::ObtainBufferSynchronize::FullSynchronize;
+ const auto post_op = IS_IMAGE_UPLOAD ? VideoCommon::ObtainBufferOperation::DoNothing
+ : VideoCommon::ObtainBufferOperation::MarkAsWritten;
+ const auto [buffer, offset] =
+ buffer_cache.ObtainBuffer(buffer_operand.address, buffer_size, sync_info, post_op);
+
+ const auto [image, copy] = texture_cache.DmaBufferImageCopy(
+ copy_info, buffer_operand, image_operand, image_id, IS_IMAGE_UPLOAD);
+ const std::span copy_span{&copy, 1};
+
+ if constexpr (IS_IMAGE_UPLOAD) {
+ image->UploadMemory(buffer->Handle(), offset, copy_span);
+ } else {
+ image->DownloadMemory(buffer->Handle(), offset, copy_span);
+ }
+ return true;
+}
+
+bool AccelerateDMA::ImageToBuffer(const Tegra::DMA::ImageCopy& copy_info,
+ const Tegra::DMA::ImageOperand& image_operand,
+ const Tegra::DMA::BufferOperand& buffer_operand) {
+ return DmaBufferImageCopy<false>(copy_info, buffer_operand, image_operand);
+}
+
+bool AccelerateDMA::BufferToImage(const Tegra::DMA::ImageCopy& copy_info,
+ const Tegra::DMA::BufferOperand& buffer_operand,
+ const Tegra::DMA::ImageOperand& image_operand) {
+ return DmaBufferImageCopy<true>(copy_info, buffer_operand, image_operand);
+}
+
} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index 0c45832ae..ad6978bd0 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -50,14 +50,26 @@ static_assert(sizeof(BindlessSSBO) * CHAR_BIT == 128);
class AccelerateDMA : public Tegra::Engines::AccelerateDMAInterface {
public:
- explicit AccelerateDMA(BufferCache& buffer_cache);
+ explicit AccelerateDMA(BufferCache& buffer_cache, TextureCache& texture_cache);
bool BufferCopy(GPUVAddr src_address, GPUVAddr dest_address, u64 amount) override;
bool BufferClear(GPUVAddr src_address, u64 amount, u32 value) override;
+ bool ImageToBuffer(const Tegra::DMA::ImageCopy& copy_info, const Tegra::DMA::ImageOperand& src,
+ const Tegra::DMA::BufferOperand& dst) override;
+
+ bool BufferToImage(const Tegra::DMA::ImageCopy& copy_info, const Tegra::DMA::BufferOperand& src,
+ const Tegra::DMA::ImageOperand& dst) override;
+
private:
+ template <bool IS_IMAGE_UPLOAD>
+ bool DmaBufferImageCopy(const Tegra::DMA::ImageCopy& copy_info,
+ const Tegra::DMA::BufferOperand& src,
+ const Tegra::DMA::ImageOperand& dst);
+
BufferCache& buffer_cache;
+ TextureCache& texture_cache;
};
class RasterizerOpenGL : public VideoCore::RasterizerAccelerated,
@@ -150,7 +162,7 @@ private:
/// Syncs the cull mode to match the guest state
void SyncCullMode();
- /// Syncs the primitve restart to match the guest state
+ /// Syncs the primitive restart to match the guest state
void SyncPrimitiveRestart();
/// Syncs the depth test state to match the guest state
@@ -234,7 +246,7 @@ private:
std::array<GLuint, MAX_TEXTURES> texture_handles{};
std::array<GLuint, MAX_IMAGES> image_handles{};
- /// Number of commands queued to the OpenGL driver. Resetted on flush.
+ /// Number of commands queued to the OpenGL driver. Reset on flush.
size_t num_queued_commands = 0;
bool has_written_global_memory = false;
diff --git a/src/video_core/renderer_opengl/gl_resource_manager.cpp b/src/video_core/renderer_opengl/gl_resource_manager.cpp
index 3a664fdec..eae8fd110 100644
--- a/src/video_core/renderer_opengl/gl_resource_manager.cpp
+++ b/src/video_core/renderer_opengl/gl_resource_manager.cpp
@@ -3,6 +3,7 @@
#include <string_view>
#include <glad/glad.h>
+#include "common/assert.h"
#include "common/microprofile.h"
#include "video_core/renderer_opengl/gl_resource_manager.h"
#include "video_core/renderer_opengl/gl_shader_util.h"
@@ -158,6 +159,15 @@ void OGLSync::Release() {
handle = 0;
}
+bool OGLSync::IsSignaled() const noexcept {
+ // At least on Nvidia, glClientWaitSync with a timeout of 0
+ // is faster than glGetSynciv of GL_SYNC_STATUS.
+ // Timeout of 0 means this check is non-blocking.
+ const auto sync_status = glClientWaitSync(handle, 0, 0);
+ ASSERT(sync_status != GL_WAIT_FAILED);
+ return sync_status != GL_TIMEOUT_EXPIRED;
+}
+
void OGLFramebuffer::Create() {
if (handle != 0)
return;
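The call sites converted in this change (fence manager, pipeline build check, staging buffer search) all follow the same non-blocking poll pattern; a minimal sketch:

    OGLSync sync;
    sync.Create();              // insert a fence into the GL command stream
    // ... later, on the same context:
    if (sync.IsSignaled()) {    // glClientWaitSync with a timeout of 0
        sync.Release();         // fence passed; safe to recycle the resource
    }
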
diff --git a/src/video_core/renderer_opengl/gl_resource_manager.h b/src/video_core/renderer_opengl/gl_resource_manager.h
index bc05ba4bd..77362acd2 100644
--- a/src/video_core/renderer_opengl/gl_resource_manager.h
+++ b/src/video_core/renderer_opengl/gl_resource_manager.h
@@ -263,6 +263,9 @@ public:
/// Deletes the internal OpenGL resource
void Release();
+ /// Checks if the sync has been signaled
+ bool IsSignaled() const noexcept;
+
GLsync handle = 0;
};
diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp
index b047e7b3d..0b9c4a904 100644
--- a/src/video_core/renderer_opengl/gl_texture_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp
@@ -112,13 +112,17 @@ GLenum ImageTarget(Shader::TextureType type, int num_samples = 1) {
return GL_NONE;
}
-GLenum TextureMode(PixelFormat format, bool is_first) {
+GLenum TextureMode(PixelFormat format, std::array<SwizzleSource, 4> swizzle) {
+ bool any_r =
+ std::ranges::any_of(swizzle, [](SwizzleSource s) { return s == SwizzleSource::R; });
switch (format) {
case PixelFormat::D24_UNORM_S8_UINT:
case PixelFormat::D32_FLOAT_S8_UINT:
- return is_first ? GL_DEPTH_COMPONENT : GL_STENCIL_INDEX;
+ // R = depth, G = stencil
+ return any_r ? GL_DEPTH_COMPONENT : GL_STENCIL_INDEX;
case PixelFormat::S8_UINT_D24_UNORM:
- return is_first ? GL_STENCIL_INDEX : GL_DEPTH_COMPONENT;
+ // R = stencil, G = depth
+ return any_r ? GL_STENCIL_INDEX : GL_DEPTH_COMPONENT;
default:
ASSERT(false);
return GL_DEPTH_COMPONENT;
@@ -208,8 +212,7 @@ void ApplySwizzle(GLuint handle, PixelFormat format, std::array<SwizzleSource, 4
case PixelFormat::D32_FLOAT_S8_UINT:
case PixelFormat::S8_UINT_D24_UNORM:
UNIMPLEMENTED_IF(swizzle[0] != SwizzleSource::R && swizzle[0] != SwizzleSource::G);
- glTextureParameteri(handle, GL_DEPTH_STENCIL_TEXTURE_MODE,
- TextureMode(format, swizzle[0] == SwizzleSource::R));
+ glTextureParameteri(handle, GL_DEPTH_STENCIL_TEXTURE_MODE, TextureMode(format, swizzle));
std::ranges::transform(swizzle, swizzle.begin(), ConvertGreenRed);
break;
case PixelFormat::A5B5G5R1_UNORM: {
@@ -714,9 +717,7 @@ std::optional<size_t> TextureCacheRuntime::StagingBuffers::FindBuffer(size_t req
continue;
}
if (syncs[index].handle != 0) {
- GLint status;
- glGetSynciv(syncs[index].handle, GL_SYNC_STATUS, 1, nullptr, &status);
- if (status != GL_SIGNALED) {
+ if (!syncs[index].IsSignaled()) {
continue;
}
syncs[index].Release();
@@ -762,14 +763,14 @@ Image::Image(const VideoCommon::NullImageParams& params) : VideoCommon::ImageBas
Image::~Image() = default;
-void Image::UploadMemory(const ImageBufferMap& map,
+void Image::UploadMemory(GLuint buffer_handle, size_t buffer_offset,
std::span<const VideoCommon::BufferImageCopy> copies) {
const bool is_rescaled = True(flags & ImageFlagBits::Rescaled);
if (is_rescaled) {
ScaleDown(true);
}
- glBindBuffer(GL_PIXEL_UNPACK_BUFFER, map.buffer);
- glFlushMappedBufferRange(GL_PIXEL_UNPACK_BUFFER, map.offset, unswizzled_size_bytes);
+ glBindBuffer(GL_PIXEL_UNPACK_BUFFER, buffer_handle);
+ glFlushMappedBufferRange(GL_PIXEL_UNPACK_BUFFER, buffer_offset, unswizzled_size_bytes);
glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
@@ -788,21 +789,26 @@ void Image::UploadMemory(const ImageBufferMap& map,
current_image_height = copy.buffer_image_height;
glPixelStorei(GL_UNPACK_IMAGE_HEIGHT, current_image_height);
}
- CopyBufferToImage(copy, map.offset);
+ CopyBufferToImage(copy, buffer_offset);
}
if (is_rescaled) {
ScaleUp();
}
}
-void Image::DownloadMemory(ImageBufferMap& map,
+void Image::UploadMemory(const ImageBufferMap& map,
+ std::span<const VideoCommon::BufferImageCopy> copies) {
+ UploadMemory(map.buffer, map.offset, copies);
+}
+
+void Image::DownloadMemory(GLuint buffer_handle, size_t buffer_offset,
std::span<const VideoCommon::BufferImageCopy> copies) {
const bool is_rescaled = True(flags & ImageFlagBits::Rescaled);
if (is_rescaled) {
ScaleDown();
}
glMemoryBarrier(GL_PIXEL_BUFFER_BARRIER_BIT); // TODO: Move this to its own API
- glBindBuffer(GL_PIXEL_PACK_BUFFER, map.buffer);
+ glBindBuffer(GL_PIXEL_PACK_BUFFER, buffer_handle);
glPixelStorei(GL_PACK_ALIGNMENT, 1);
u32 current_row_length = std::numeric_limits<u32>::max();
@@ -820,13 +826,18 @@ void Image::DownloadMemory(ImageBufferMap& map,
current_image_height = copy.buffer_image_height;
glPixelStorei(GL_PACK_IMAGE_HEIGHT, current_image_height);
}
- CopyImageToBuffer(copy, map.offset);
+ CopyImageToBuffer(copy, buffer_offset);
}
if (is_rescaled) {
ScaleUp(true);
}
}
+void Image::DownloadMemory(ImageBufferMap& map,
+ std::span<const VideoCommon::BufferImageCopy> copies) {
+ DownloadMemory(map.buffer, map.offset, copies);
+}
+
GLuint Image::StorageHandle() noexcept {
switch (info.format) {
case PixelFormat::A8B8G8R8_SRGB:
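A worked example of the swizzle rule change, assuming a D24_UNORM_S8_UINT view where only the alpha channel sources R:

    const std::array swizzle{SwizzleSource::G, SwizzleSource::G,
                             SwizzleSource::G, SwizzleSource::R};
    // Old rule: swizzle[0] == R is false -> GL_STENCIL_INDEX (stencil plane).
    // New rule: any component sourcing R -> GL_DEPTH_COMPONENT, since R maps
    // to depth for this format.
    const GLenum mode = TextureMode(PixelFormat::D24_UNORM_S8_UINT, swizzle);
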
diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h
index e30875496..911e4607a 100644
--- a/src/video_core/renderer_opengl/gl_texture_cache.h
+++ b/src/video_core/renderer_opengl/gl_texture_cache.h
@@ -206,9 +206,15 @@ public:
Image(Image&&) = default;
Image& operator=(Image&&) = default;
+ void UploadMemory(GLuint buffer_handle, size_t buffer_offset,
+ std::span<const VideoCommon::BufferImageCopy> copies);
+
void UploadMemory(const ImageBufferMap& map,
std::span<const VideoCommon::BufferImageCopy> copies);
+ void DownloadMemory(GLuint buffer_handle, size_t buffer_offset,
+ std::span<const VideoCommon::BufferImageCopy> copies);
+
void DownloadMemory(ImageBufferMap& map, std::span<const VideoCommon::BufferImageCopy> copies);
GLuint StorageHandle() noexcept;
diff --git a/src/video_core/renderer_vulkan/fixed_pipeline_state.cpp b/src/video_core/renderer_vulkan/fixed_pipeline_state.cpp
index f8398b511..e7df32d84 100644
--- a/src/video_core/renderer_vulkan/fixed_pipeline_state.cpp
+++ b/src/video_core/renderer_vulkan/fixed_pipeline_state.cpp
@@ -271,7 +271,7 @@ bool FixedPipelineState::operator==(const FixedPipelineState& rhs) const noexcep
u32 FixedPipelineState::PackComparisonOp(Maxwell::ComparisonOp op) noexcept {
// OpenGL enums go from 0x200 to 0x207 and the others from 1 to 8
- // If we substract 0x200 to OpenGL enums and 1 to the others we get a 0-7 range.
+    // If we subtract 0x200 from the OpenGL enums and 1 from the others, we get a 0-7 range.
// Perfect for a hash.
const u32 value = static_cast<u32>(op);
return value - (value >= 0x200 ? 0x200 : 1);
@@ -322,8 +322,8 @@ Maxwell::StencilOp::Op FixedPipelineState::UnpackStencilOp(u32 packed) noexcept
}
u32 FixedPipelineState::PackCullFace(Maxwell::CullFace cull) noexcept {
- // FrontAndBack is 0x408, by substracting 0x406 in it we get 2.
- // Individual cull faces are in 0x404 and 0x405, substracting 0x404 we get 0 and 1.
+    // FrontAndBack is 0x408; subtracting 0x406 from it gives 2.
+    // Individual cull faces are 0x404 and 0x405; subtracting 0x404 gives 0 and 1.
const u32 value = static_cast<u32>(cull);
return value - (value == 0x408 ? 0x406 : 0x404);
}
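Spot checks of the packing math, using the enum values quoted in the comments above (GL comparison ops span 0x200-0x207, the D3D-style ones 1-8):

    // ComparisonOp: GL_NEVER (0x200) and D3D Never (1) both pack to 0;
    // GL_ALWAYS (0x207) and D3D Always (8) both pack to 7.
    static_assert(0x200 - 0x200 == 0 && 1 - 1 == 0);
    static_assert(0x207 - 0x200 == 7 && 8 - 1 == 7);
    // CullFace: Front (0x404) -> 0, Back (0x405) -> 1, FrontAndBack (0x408) -> 2.
    static_assert(0x404 - 0x404 == 0 && 0x405 - 0x404 == 1 && 0x408 - 0x406 == 2);
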
diff --git a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
index ca52e2389..5dce51be8 100644
--- a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
+++ b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
@@ -166,7 +166,7 @@ struct FormatTuple {
{VK_FORMAT_R16G16_UINT, Attachable | Storage}, // R16G16_UINT
{VK_FORMAT_R16G16_SINT, Attachable | Storage}, // R16G16_SINT
{VK_FORMAT_R16G16_SNORM, Attachable | Storage}, // R16G16_SNORM
- {VK_FORMAT_UNDEFINED}, // R32G32B32_FLOAT
+ {VK_FORMAT_R32G32B32_SFLOAT}, // R32G32B32_FLOAT
{VK_FORMAT_A8B8G8R8_SRGB_PACK32, Attachable}, // A8B8G8R8_SRGB
{VK_FORMAT_R8G8_UNORM, Attachable | Storage}, // R8G8_UNORM
{VK_FORMAT_R8G8_SNORM, Attachable | Storage}, // R8G8_SNORM
@@ -234,11 +234,6 @@ FormatInfo SurfaceFormat(const Device& device, FormatType format_type, bool with
PixelFormat pixel_format) {
ASSERT(static_cast<size_t>(pixel_format) < std::size(tex_format_tuples));
FormatTuple tuple = tex_format_tuples[static_cast<size_t>(pixel_format)];
- if (tuple.format == VK_FORMAT_UNDEFINED) {
- UNIMPLEMENTED_MSG("Unimplemented texture format with pixel format={}", pixel_format);
- return FormatInfo{VK_FORMAT_A8B8G8R8_UNORM_PACK32, true, true};
- }
-
// Use A8B8G8R8_UNORM on hardware that doesn't support ASTC natively
if (!device.IsOptimalAstcSupported() && VideoCore::Surface::IsPixelFormatASTC(pixel_format)) {
const bool is_srgb = with_srgb && VideoCore::Surface::IsPixelFormatSRGB(pixel_format);
diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
index b0153a502..9cbcb3c8f 100644
--- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
@@ -238,7 +238,7 @@ private:
return indices;
}
- void MakeAndUpdateIndices(u8* staging_data, size_t quad_size, u32 quad, u32 first) {
+ void MakeAndUpdateIndices(u8* staging_data, size_t quad_size, u32 quad, u32 first) override {
switch (index_type) {
case VK_INDEX_TYPE_UINT8_EXT:
std::memcpy(staging_data, MakeIndices<u8>(quad, first).data(), quad_size);
@@ -278,7 +278,7 @@ private:
return indices;
}
- void MakeAndUpdateIndices(u8* staging_data, size_t quad_size, u32 quad, u32 first) {
+ void MakeAndUpdateIndices(u8* staging_data, size_t quad_size, u32 quad, u32 first) override {
switch (index_type) {
case VK_INDEX_TYPE_UINT8_EXT:
std::memcpy(staging_data, MakeIndices<u8>(quad, first).data(), quad_size);
diff --git a/src/video_core/renderer_vulkan/vk_command_pool.cpp b/src/video_core/renderer_vulkan/vk_command_pool.cpp
index 2f09de1c1..d0dbf7ca5 100644
--- a/src/video_core/renderer_vulkan/vk_command_pool.cpp
+++ b/src/video_core/renderer_vulkan/vk_command_pool.cpp
@@ -22,8 +22,8 @@ CommandPool::CommandPool(MasterSemaphore& master_semaphore_, const Device& devic
CommandPool::~CommandPool() = default;
void CommandPool::Allocate(size_t begin, size_t end) {
- // Command buffers are going to be commited, recorded, executed every single usage cycle.
- // They are also going to be reseted when commited.
+ // Command buffers are going to be committed, recorded, executed every single usage cycle.
+ // They are also going to be reset when committed.
Pool& pool = pools.emplace_back();
pool.handle = device.GetLogical().CreateCommandPool({
.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index 719edbcfb..673ab478e 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -172,7 +172,7 @@ RasterizerVulkan::RasterizerVulkan(Core::Frontend::EmuWindow& emu_window_, Tegra
buffer_cache(*this, cpu_memory_, buffer_cache_runtime),
pipeline_cache(*this, device, scheduler, descriptor_pool, update_descriptor_queue,
render_pass_cache, buffer_cache, texture_cache, gpu.ShaderNotify()),
- query_cache{*this, device, scheduler}, accelerate_dma{buffer_cache},
+ query_cache{*this, device, scheduler}, accelerate_dma(buffer_cache, texture_cache, scheduler),
fence_manager(*this, gpu, texture_cache, buffer_cache, query_cache, device, scheduler),
wfi_event(device.GetLogical().CreateEvent()) {
scheduler.SetQueryCache(query_cache);
@@ -671,7 +671,7 @@ bool RasterizerVulkan::AccelerateConditionalRendering() {
// TODO(Blinkhawk): Reimplement Host conditional rendering.
return false;
}
- // Medium / Low Hack: stub any checks on queries writen into the buffer cache.
+ // Medium / Low Hack: stub any checks on queries written into the buffer cache.
const GPUVAddr condition_address{maxwell3d->regs.render_enable.Address()};
Maxwell::ReportSemaphore::Compare cmp;
if (gpu_memory->IsMemoryDirty(condition_address, sizeof(cmp),
@@ -756,7 +756,9 @@ void RasterizerVulkan::FlushWork() {
draw_counter = 0;
}
-AccelerateDMA::AccelerateDMA(BufferCache& buffer_cache_) : buffer_cache{buffer_cache_} {}
+AccelerateDMA::AccelerateDMA(BufferCache& buffer_cache_, TextureCache& texture_cache_,
+ Scheduler& scheduler_)
+ : buffer_cache{buffer_cache_}, texture_cache{texture_cache_}, scheduler{scheduler_} {}
bool AccelerateDMA::BufferClear(GPUVAddr src_address, u64 amount, u32 value) {
std::scoped_lock lock{buffer_cache.mutex};
@@ -768,6 +770,46 @@ bool AccelerateDMA::BufferCopy(GPUVAddr src_address, GPUVAddr dest_address, u64
return buffer_cache.DMACopy(src_address, dest_address, amount);
}
+template <bool IS_IMAGE_UPLOAD>
+bool AccelerateDMA::DmaBufferImageCopy(const Tegra::DMA::ImageCopy& copy_info,
+ const Tegra::DMA::BufferOperand& buffer_operand,
+ const Tegra::DMA::ImageOperand& image_operand) {
+ std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex};
+ const auto image_id = texture_cache.DmaImageId(image_operand);
+ if (image_id == VideoCommon::NULL_IMAGE_ID) {
+ return false;
+ }
+ const u32 buffer_size = static_cast<u32>(buffer_operand.pitch * buffer_operand.height);
+ static constexpr auto sync_info = VideoCommon::ObtainBufferSynchronize::FullSynchronize;
+ const auto post_op = IS_IMAGE_UPLOAD ? VideoCommon::ObtainBufferOperation::DoNothing
+ : VideoCommon::ObtainBufferOperation::MarkAsWritten;
+ const auto [buffer, offset] =
+ buffer_cache.ObtainBuffer(buffer_operand.address, buffer_size, sync_info, post_op);
+
+ const auto [image, copy] = texture_cache.DmaBufferImageCopy(
+ copy_info, buffer_operand, image_operand, image_id, IS_IMAGE_UPLOAD);
+ const std::span copy_span{&copy, 1};
+
+ if constexpr (IS_IMAGE_UPLOAD) {
+ image->UploadMemory(buffer->Handle(), offset, copy_span);
+ } else {
+ image->DownloadMemory(buffer->Handle(), offset, copy_span);
+ }
+ return true;
+}
+
+bool AccelerateDMA::ImageToBuffer(const Tegra::DMA::ImageCopy& copy_info,
+ const Tegra::DMA::ImageOperand& image_operand,
+ const Tegra::DMA::BufferOperand& buffer_operand) {
+ return DmaBufferImageCopy<false>(copy_info, buffer_operand, image_operand);
+}
+
+bool AccelerateDMA::BufferToImage(const Tegra::DMA::ImageCopy& copy_info,
+ const Tegra::DMA::BufferOperand& buffer_operand,
+ const Tegra::DMA::ImageOperand& image_operand) {
+ return DmaBufferImageCopy<true>(copy_info, buffer_operand, image_operand);
+}
+
void RasterizerVulkan::UpdateDynamicStates() {
auto& regs = maxwell3d->regs;
UpdateViewportsState(regs);
@@ -1064,7 +1106,7 @@ void RasterizerVulkan::UpdateDepthBoundsTestEnable(Tegra::Engines::Maxwell3D::Re
LOG_WARNING(Render_Vulkan, "Depth bounds is enabled but not supported");
enabled = false;
}
- scheduler.Record([enable = regs.depth_bounds_enable](vk::CommandBuffer cmdbuf) {
+ scheduler.Record([enable = enabled](vk::CommandBuffer cmdbuf) {
cmdbuf.SetDepthBoundsTestEnableEXT(enable);
});
}
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h
index a0508b57c..1659fbc13 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.h
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.h
@@ -45,14 +45,28 @@ class StateTracker;
class AccelerateDMA : public Tegra::Engines::AccelerateDMAInterface {
public:
- explicit AccelerateDMA(BufferCache& buffer_cache);
+ explicit AccelerateDMA(BufferCache& buffer_cache, TextureCache& texture_cache,
+ Scheduler& scheduler);
bool BufferCopy(GPUVAddr start_address, GPUVAddr end_address, u64 amount) override;
bool BufferClear(GPUVAddr src_address, u64 amount, u32 value) override;
+ bool ImageToBuffer(const Tegra::DMA::ImageCopy& copy_info, const Tegra::DMA::ImageOperand& src,
+ const Tegra::DMA::BufferOperand& dst) override;
+
+ bool BufferToImage(const Tegra::DMA::ImageCopy& copy_info, const Tegra::DMA::BufferOperand& src,
+ const Tegra::DMA::ImageOperand& dst) override;
+
private:
+ template <bool IS_IMAGE_UPLOAD>
+ bool DmaBufferImageCopy(const Tegra::DMA::ImageCopy& copy_info,
+ const Tegra::DMA::BufferOperand& src,
+ const Tegra::DMA::ImageOperand& dst);
+
BufferCache& buffer_cache;
+ TextureCache& texture_cache;
+ Scheduler& scheduler;
};
class RasterizerVulkan final : public VideoCore::RasterizerAccelerated,
diff --git a/src/video_core/renderer_vulkan/vk_resource_pool.cpp b/src/video_core/renderer_vulkan/vk_resource_pool.cpp
index 6c8ac22f4..6572f82ba 100644
--- a/src/video_core/renderer_vulkan/vk_resource_pool.cpp
+++ b/src/video_core/renderer_vulkan/vk_resource_pool.cpp
@@ -37,7 +37,7 @@ size_t ResourcePool::CommitResource() {
found = free_resource;
}
}
- // Free iterator is hinted to the resource after the one that's been commited.
+ // Free iterator is hinted to the resource after the one that's been committed.
hint_iterator = (*found + 1) % ticks.size();
return *found;
}
@@ -46,7 +46,7 @@ size_t ResourcePool::ManageOverflow() {
const size_t old_capacity = ticks.size();
Grow();
- // The last entry is guaranted to be free, since it's the first element of the freshly
+ // The last entry is guaranteed to be free, since it's the first element of the freshly
// allocated resources.
return old_capacity;
}
diff --git a/src/video_core/renderer_vulkan/vk_scheduler.cpp b/src/video_core/renderer_vulkan/vk_scheduler.cpp
index e03685af1..c636a1625 100644
--- a/src/video_core/renderer_vulkan/vk_scheduler.cpp
+++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp
@@ -47,14 +47,15 @@ Scheduler::Scheduler(const Device& device_, StateTracker& state_tracker_)
Scheduler::~Scheduler() = default;
void Scheduler::Flush(VkSemaphore signal_semaphore, VkSemaphore wait_semaphore) {
+ // When flushing, we only send data to the worker thread; no waiting is necessary.
SubmitExecution(signal_semaphore, wait_semaphore);
AllocateNewContext();
}
void Scheduler::Finish(VkSemaphore signal_semaphore, VkSemaphore wait_semaphore) {
+ // When finishing, we need to wait for the submission to have executed on the device.
const u64 presubmit_tick = CurrentTick();
SubmitExecution(signal_semaphore, wait_semaphore);
- WaitWorker();
Wait(presubmit_tick);
AllocateNewContext();
}
@@ -63,8 +64,13 @@ void Scheduler::WaitWorker() {
MICROPROFILE_SCOPE(Vulkan_WaitForWorker);
DispatchWork();
- std::unique_lock lock{work_mutex};
- wait_cv.wait(lock, [this] { return work_queue.empty(); });
+ // Ensure the queue is drained.
+ std::unique_lock ql{queue_mutex};
+ event_cv.wait(ql, [this] { return work_queue.empty(); });
+
+ // Now wait for execution to finish.
+ // This needs to be done in the same order as WorkerThread.
+ std::unique_lock el{execution_mutex};
}
void Scheduler::DispatchWork() {
@@ -72,10 +78,10 @@ void Scheduler::DispatchWork() {
return;
}
{
- std::scoped_lock lock{work_mutex};
+ std::scoped_lock ql{queue_mutex};
work_queue.push(std::move(chunk));
}
- work_cv.notify_one();
+ event_cv.notify_all();
AcquireNewChunk();
}
@@ -137,30 +143,55 @@ bool Scheduler::UpdateRescaling(bool is_rescaling) {
void Scheduler::WorkerThread(std::stop_token stop_token) {
Common::SetCurrentThreadName("VulkanWorker");
- do {
+
+ const auto TryPopQueue{[this](auto& work) -> bool {
+ if (work_queue.empty()) {
+ return false;
+ }
+
+ work = std::move(work_queue.front());
+ work_queue.pop();
+ event_cv.notify_all();
+ return true;
+ }};
+
+ while (!stop_token.stop_requested()) {
std::unique_ptr<CommandChunk> work;
- bool has_submit{false};
+
{
- std::unique_lock lock{work_mutex};
- if (work_queue.empty()) {
- wait_cv.notify_all();
- }
- Common::CondvarWait(work_cv, lock, stop_token, [&] { return !work_queue.empty(); });
+ std::unique_lock lk{queue_mutex};
+
+ // Wait for work.
+ Common::CondvarWait(event_cv, lk, stop_token, [&] { return TryPopQueue(work); });
+
+ // If we've been asked to stop, we're done.
if (stop_token.stop_requested()) {
- continue;
+ return;
}
- work = std::move(work_queue.front());
- work_queue.pop();
- has_submit = work->HasSubmit();
+ // Exchange lock ownership so that we take the execution lock before
+ // the queue lock goes out of scope. This allows us to force execution
+ // to complete in the next step.
+ std::exchange(lk, std::unique_lock{execution_mutex});
+
+ // Perform the work, tracking whether the chunk was a submission
+ // before executing.
+ const bool has_submit = work->HasSubmit();
work->ExecuteAll(current_cmdbuf);
+
+ // If the chunk was a submission, reallocate the command buffer.
+ if (has_submit) {
+ AllocateWorkerCommandBuffer();
+ }
}
- if (has_submit) {
- AllocateWorkerCommandBuffer();
+
+ {
+ std::scoped_lock rl{reserve_mutex};
+
+ // Recycle the chunk back to the reserve.
+ chunk_reserve.emplace_back(std::move(work));
}
- std::scoped_lock reserve_lock{reserve_mutex};
- chunk_reserve.push_back(std::move(work));
- } while (!stop_token.stop_requested());
+ }
}
void Scheduler::AllocateWorkerCommandBuffer() {
@@ -289,13 +320,16 @@ void Scheduler::EndRenderPass() {
}
void Scheduler::AcquireNewChunk() {
- std::scoped_lock lock{reserve_mutex};
+ std::scoped_lock rl{reserve_mutex};
+
if (chunk_reserve.empty()) {
+ // If we don't have anything reserved, we need to make a new chunk.
chunk = std::make_unique<CommandChunk>();
- return;
+ } else {
+ // Otherwise, we can just take from the reserve.
+        chunk = std::move(chunk_reserve.back());
+ chunk_reserve.pop_back();
}
- chunk = std::move(chunk_reserve.back());
- chunk_reserve.pop_back();
}
} // namespace Vulkan
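A sketch of the intended call pattern (hypothetical caller), illustrating why Finish no longer calls WaitWorker: waiting on the presubmit tick already implies the worker has executed the submission chunk.

    scheduler.Record([](vk::CommandBuffer cmdbuf) { /* record commands */ });
    const u64 tick = scheduler.CurrentTick();
    scheduler.Flush();    // hand the chunk to the worker; returns immediately
    scheduler.Wait(tick); // block until the GPU signals the timeline tick
    // scheduler.Finish() is the shorthand: submit, then Wait(presubmit tick).
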
diff --git a/src/video_core/renderer_vulkan/vk_scheduler.h b/src/video_core/renderer_vulkan/vk_scheduler.h
index bd4cb0f7e..8d75ce987 100644
--- a/src/video_core/renderer_vulkan/vk_scheduler.h
+++ b/src/video_core/renderer_vulkan/vk_scheduler.h
@@ -232,10 +232,10 @@ private:
std::queue<std::unique_ptr<CommandChunk>> work_queue;
std::vector<std::unique_ptr<CommandChunk>> chunk_reserve;
+ std::mutex execution_mutex;
std::mutex reserve_mutex;
- std::mutex work_mutex;
- std::condition_variable_any work_cv;
- std::condition_variable wait_cv;
+ std::mutex queue_mutex;
+ std::condition_variable_any event_cv;
std::jthread worker_thread;
};
diff --git a/src/video_core/renderer_vulkan/vk_swapchain.cpp b/src/video_core/renderer_vulkan/vk_swapchain.cpp
index b6810eef9..85fdce6e5 100644
--- a/src/video_core/renderer_vulkan/vk_swapchain.cpp
+++ b/src/video_core/renderer_vulkan/vk_swapchain.cpp
@@ -159,7 +159,7 @@ void Swapchain::CreateSwapchain(const VkSurfaceCapabilitiesKHR& capabilities, bo
present_mode = ChooseSwapPresentMode(present_modes);
u32 requested_image_count{capabilities.minImageCount + 1};
- // Ensure Tripple buffering if possible.
+ // Ensure Triple buffering if possible.
if (capabilities.maxImageCount > 0) {
if (requested_image_count > capabilities.maxImageCount) {
requested_image_count = capabilities.maxImageCount;
diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp
index 80adb70eb..ae15f6976 100644
--- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp
@@ -189,13 +189,16 @@ constexpr VkBorderColor ConvertBorderColor(const std::array<float, 4>& color) {
if (info.IsRenderTarget()) {
return ImageAspectMask(info.format);
}
- const bool is_first = info.Swizzle()[0] == SwizzleSource::R;
+ bool any_r =
+ std::ranges::any_of(info.Swizzle(), [](SwizzleSource s) { return s == SwizzleSource::R; });
switch (info.format) {
case PixelFormat::D24_UNORM_S8_UINT:
case PixelFormat::D32_FLOAT_S8_UINT:
- return is_first ? VK_IMAGE_ASPECT_DEPTH_BIT : VK_IMAGE_ASPECT_STENCIL_BIT;
+ // R = depth, G = stencil
+ return any_r ? VK_IMAGE_ASPECT_DEPTH_BIT : VK_IMAGE_ASPECT_STENCIL_BIT;
case PixelFormat::S8_UINT_D24_UNORM:
- return is_first ? VK_IMAGE_ASPECT_STENCIL_BIT : VK_IMAGE_ASPECT_DEPTH_BIT;
+ // R = stencil, G = depth
+ return any_r ? VK_IMAGE_ASPECT_STENCIL_BIT : VK_IMAGE_ASPECT_DEPTH_BIT;
case PixelFormat::D16_UNORM:
case PixelFormat::D32_FLOAT:
return VK_IMAGE_ASPECT_DEPTH_BIT;
@@ -864,13 +867,19 @@ void TextureCacheRuntime::ReinterpretImage(Image& dst, Image& src,
const VkImageAspectFlags src_aspect_mask = src.AspectMask();
const VkImageAspectFlags dst_aspect_mask = dst.AspectMask();
- std::ranges::transform(copies, vk_in_copies.begin(), [src_aspect_mask](const auto& copy) {
- return MakeBufferImageCopy(copy, true, src_aspect_mask);
- });
+ const auto bpp_in = BytesPerBlock(src.info.format) / DefaultBlockWidth(src.info.format);
+ const auto bpp_out = BytesPerBlock(dst.info.format) / DefaultBlockWidth(dst.info.format);
+ std::ranges::transform(copies, vk_in_copies.begin(),
+ [src_aspect_mask, bpp_in, bpp_out](const auto& copy) {
+ auto copy2 = copy;
+ copy2.src_offset.x = (bpp_out * copy.src_offset.x) / bpp_in;
+ copy2.extent.width = (bpp_out * copy.extent.width) / bpp_in;
+ return MakeBufferImageCopy(copy2, true, src_aspect_mask);
+ });
std::ranges::transform(copies, vk_out_copies.begin(), [dst_aspect_mask](const auto& copy) {
return MakeBufferImageCopy(copy, false, dst_aspect_mask);
});
- const u32 img_bpp = BytesPerBlock(src.info.format);
+ const u32 img_bpp = BytesPerBlock(dst.info.format);
size_t total_size = 0;
for (const auto& copy : copies) {
total_size += copy.extent.width * copy.extent.height * copy.extent.depth * img_bpp;
@@ -1306,15 +1315,16 @@ Image::Image(const VideoCommon::NullImageParams& params) : VideoCommon::ImageBas
Image::~Image() = default;
-void Image::UploadMemory(const StagingBufferRef& map, std::span<const BufferImageCopy> copies) {
+void Image::UploadMemory(VkBuffer buffer, VkDeviceSize offset,
+ std::span<const VideoCommon::BufferImageCopy> copies) {
// TODO: Move this to another API
const bool is_rescaled = True(flags & ImageFlagBits::Rescaled);
if (is_rescaled) {
ScaleDown(true);
}
scheduler->RequestOutsideRenderPassOperationContext();
- std::vector vk_copies = TransformBufferImageCopies(copies, map.offset, aspect_mask);
- const VkBuffer src_buffer = map.buffer;
+ std::vector vk_copies = TransformBufferImageCopies(copies, offset, aspect_mask);
+ const VkBuffer src_buffer = buffer;
const VkImage vk_image = *original_image;
const VkImageAspectFlags vk_aspect_mask = aspect_mask;
const bool is_initialized = std::exchange(initialized, true);
@@ -1327,14 +1337,19 @@ void Image::UploadMemory(const StagingBufferRef& map, std::span<const BufferImag
}
}
-void Image::DownloadMemory(const StagingBufferRef& map, std::span<const BufferImageCopy> copies) {
+void Image::UploadMemory(const StagingBufferRef& map, std::span<const BufferImageCopy> copies) {
+ UploadMemory(map.buffer, map.offset, copies);
+}
+
+void Image::DownloadMemory(VkBuffer buffer, VkDeviceSize offset,
+ std::span<const VideoCommon::BufferImageCopy> copies) {
const bool is_rescaled = True(flags & ImageFlagBits::Rescaled);
if (is_rescaled) {
ScaleDown();
}
- std::vector vk_copies = TransformBufferImageCopies(copies, map.offset, aspect_mask);
+ std::vector vk_copies = TransformBufferImageCopies(copies, offset, aspect_mask);
scheduler->RequestOutsideRenderPassOperationContext();
- scheduler->Record([buffer = map.buffer, image = *original_image, aspect_mask = aspect_mask,
+ scheduler->Record([buffer, image = *original_image, aspect_mask = aspect_mask,
vk_copies](vk::CommandBuffer cmdbuf) {
const VkImageMemoryBarrier read_barrier{
.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
@@ -1389,6 +1404,10 @@ void Image::DownloadMemory(const StagingBufferRef& map, std::span<const BufferIm
}
}
+void Image::DownloadMemory(const StagingBufferRef& map, std::span<const BufferImageCopy> copies) {
+ DownloadMemory(map.buffer, map.offset, copies);
+}
+
bool Image::IsRescaled() const noexcept {
return True(flags & ImageFlagBits::Rescaled);
}
@@ -1763,7 +1782,7 @@ Sampler::Sampler(TextureCacheRuntime& runtime, const Tegra::Texture::TSCEntry& t
.minLod = tsc.mipmap_filter == TextureMipmapFilter::None ? 0.0f : tsc.MinLod(),
.maxLod = tsc.mipmap_filter == TextureMipmapFilter::None ? 0.25f : tsc.MaxLod(),
.borderColor =
- arbitrary_borders ? VK_BORDER_COLOR_INT_CUSTOM_EXT : ConvertBorderColor(color),
+ arbitrary_borders ? VK_BORDER_COLOR_FLOAT_CUSTOM_EXT : ConvertBorderColor(color),
.unnormalizedCoordinates = VK_FALSE,
});
}
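The ReinterpretImage change above rescales source x offsets and widths by the ratio of destination to source bytes per pixel, because the incoming copies are expressed in destination texels. A self-contained worked sketch of that conversion, assuming uncompressed formats (Span1D and RescaleSpan are illustrative names):

#include <cstdint>

using u32 = std::uint32_t; // matching yuzu's common_types.h alias

// Hedged sketch of the bpp-ratio conversion: destination-texel coordinates
// are rescaled by bpp_out / bpp_in to land on the same source bytes.
struct Span1D {
    u32 x;
    u32 width;
};

constexpr Span1D RescaleSpan(Span1D dst_texels, u32 bpp_in, u32 bpp_out) {
    return Span1D{
        .x = (bpp_out * dst_texels.x) / bpp_in,
        .width = (bpp_out * dst_texels.width) / bpp_in,
    };
}

// 64 texels of a 4-bpp destination view cover 16 texels of a 16-bpp source.
static_assert(RescaleSpan({.x = 8, .width = 64}, 16, 4).x == 2);
static_assert(RescaleSpan({.x = 8, .width = 64}, 16, 4).width == 16);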
diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.h b/src/video_core/renderer_vulkan/vk_texture_cache.h
index 0ce39616f..d5ee23f8d 100644
--- a/src/video_core/renderer_vulkan/vk_texture_cache.h
+++ b/src/video_core/renderer_vulkan/vk_texture_cache.h
@@ -132,9 +132,15 @@ public:
Image(Image&&) = default;
Image& operator=(Image&&) = default;
+ void UploadMemory(VkBuffer buffer, VkDeviceSize offset,
+ std::span<const VideoCommon::BufferImageCopy> copies);
+
void UploadMemory(const StagingBufferRef& map,
std::span<const VideoCommon::BufferImageCopy> copies);
+ void DownloadMemory(VkBuffer buffer, VkDeviceSize offset,
+ std::span<const VideoCommon::BufferImageCopy> copies);
+
void DownloadMemory(const StagingBufferRef& map,
std::span<const VideoCommon::BufferImageCopy> copies);
diff --git a/src/video_core/renderer_vulkan/vk_update_descriptor.cpp b/src/video_core/renderer_vulkan/vk_update_descriptor.cpp
index 4d4a6753b..009dab0b6 100644
--- a/src/video_core/renderer_vulkan/vk_update_descriptor.cpp
+++ b/src/video_core/renderer_vulkan/vk_update_descriptor.cpp
@@ -25,7 +25,7 @@ void UpdateDescriptorQueue::TickFrame() {
void UpdateDescriptorQueue::Acquire() {
// Minimum number of entries required.
- // This is the maximum number of entries a single draw call migth use.
+ // This is the maximum number of entries a single draw call might use.
static constexpr size_t MIN_ENTRIES = 0x400;
if (std::distance(payload.data(), payload_cursor) + MIN_ENTRIES >= payload.max_size()) {
diff --git a/src/video_core/texture_cache/format_lookup_table.cpp b/src/video_core/texture_cache/format_lookup_table.cpp
index 08aa8ca33..5fc2b2fec 100644
--- a/src/video_core/texture_cache/format_lookup_table.cpp
+++ b/src/video_core/texture_cache/format_lookup_table.cpp
@@ -42,15 +42,15 @@ PixelFormat PixelFormatFromTextureInfo(TextureFormat format, ComponentType red,
ComponentType blue, ComponentType alpha,
bool is_srgb) noexcept {
switch (Hash(format, red, green, blue, alpha, is_srgb)) {
- case Hash(TextureFormat::A8R8G8B8, UNORM):
+ case Hash(TextureFormat::A8B8G8R8, UNORM):
return PixelFormat::A8B8G8R8_UNORM;
- case Hash(TextureFormat::A8R8G8B8, SNORM):
+ case Hash(TextureFormat::A8B8G8R8, SNORM):
return PixelFormat::A8B8G8R8_SNORM;
- case Hash(TextureFormat::A8R8G8B8, UINT):
+ case Hash(TextureFormat::A8B8G8R8, UINT):
return PixelFormat::A8B8G8R8_UINT;
- case Hash(TextureFormat::A8R8G8B8, SINT):
+ case Hash(TextureFormat::A8B8G8R8, SINT):
return PixelFormat::A8B8G8R8_SINT;
- case Hash(TextureFormat::A8R8G8B8, UNORM, SRGB):
+ case Hash(TextureFormat::A8B8G8R8, UNORM, SRGB):
return PixelFormat::A8B8G8R8_SRGB;
case Hash(TextureFormat::B5G6R5, UNORM):
return PixelFormat::B5G6R5_UNORM;
@@ -74,13 +74,13 @@ PixelFormat PixelFormatFromTextureInfo(TextureFormat format, ComponentType red,
return PixelFormat::R8_UINT;
case Hash(TextureFormat::R8, SINT):
return PixelFormat::R8_SINT;
- case Hash(TextureFormat::R8G8, UNORM):
+ case Hash(TextureFormat::G8R8, UNORM):
return PixelFormat::R8G8_UNORM;
- case Hash(TextureFormat::R8G8, SNORM):
+ case Hash(TextureFormat::G8R8, SNORM):
return PixelFormat::R8G8_SNORM;
- case Hash(TextureFormat::R8G8, UINT):
+ case Hash(TextureFormat::G8R8, UINT):
return PixelFormat::R8G8_UINT;
- case Hash(TextureFormat::R8G8, SINT):
+ case Hash(TextureFormat::G8R8, SINT):
return PixelFormat::R8G8_SINT;
case Hash(TextureFormat::R16G16B16A16, FLOAT):
return PixelFormat::R16G16B16A16_FLOAT;
@@ -136,49 +136,49 @@ PixelFormat PixelFormatFromTextureInfo(TextureFormat format, ComponentType red,
return PixelFormat::R32_SINT;
case Hash(TextureFormat::E5B9G9R9, FLOAT):
return PixelFormat::E5B9G9R9_FLOAT;
- case Hash(TextureFormat::D32, FLOAT):
+ case Hash(TextureFormat::Z32, FLOAT):
return PixelFormat::D32_FLOAT;
- case Hash(TextureFormat::D16, UNORM):
+ case Hash(TextureFormat::Z16, UNORM):
return PixelFormat::D16_UNORM;
- case Hash(TextureFormat::S8D24, UINT, UNORM, UNORM, UNORM, LINEAR):
+ case Hash(TextureFormat::Z24S8, UINT, UNORM, UNORM, UNORM, LINEAR):
return PixelFormat::S8_UINT_D24_UNORM;
- case Hash(TextureFormat::S8D24, UINT, UNORM, UINT, UINT, LINEAR):
+ case Hash(TextureFormat::Z24S8, UINT, UNORM, UINT, UINT, LINEAR):
return PixelFormat::S8_UINT_D24_UNORM;
- case Hash(TextureFormat::R8G24, UINT, UNORM, UNORM, UNORM, LINEAR):
+ case Hash(TextureFormat::G24R8, UINT, UNORM, UNORM, UNORM, LINEAR):
return PixelFormat::S8_UINT_D24_UNORM;
- case Hash(TextureFormat::D24S8, UNORM, UINT, UINT, UINT, LINEAR):
+ case Hash(TextureFormat::S8Z24, UNORM, UINT, UINT, UINT, LINEAR):
return PixelFormat::D24_UNORM_S8_UINT;
- case Hash(TextureFormat::D32S8, FLOAT, UINT, UNORM, UNORM, LINEAR):
+ case Hash(TextureFormat::Z32_X24S8, FLOAT, UINT, UNORM, UNORM, LINEAR):
return PixelFormat::D32_FLOAT_S8_UINT;
- case Hash(TextureFormat::R32_B24G8, FLOAT, UINT, UNORM, UNORM, LINEAR):
+ case Hash(TextureFormat::R32B24G8, FLOAT, UINT, UNORM, UNORM, LINEAR):
return PixelFormat::D32_FLOAT_S8_UINT;
- case Hash(TextureFormat::BC1_RGBA, UNORM, LINEAR):
+ case Hash(TextureFormat::DXT1, UNORM, LINEAR):
return PixelFormat::BC1_RGBA_UNORM;
- case Hash(TextureFormat::BC1_RGBA, UNORM, SRGB):
+ case Hash(TextureFormat::DXT1, UNORM, SRGB):
return PixelFormat::BC1_RGBA_SRGB;
- case Hash(TextureFormat::BC2, UNORM, LINEAR):
+ case Hash(TextureFormat::DXT23, UNORM, LINEAR):
return PixelFormat::BC2_UNORM;
- case Hash(TextureFormat::BC2, UNORM, SRGB):
+ case Hash(TextureFormat::DXT23, UNORM, SRGB):
return PixelFormat::BC2_SRGB;
- case Hash(TextureFormat::BC3, UNORM, LINEAR):
+ case Hash(TextureFormat::DXT45, UNORM, LINEAR):
return PixelFormat::BC3_UNORM;
- case Hash(TextureFormat::BC3, UNORM, SRGB):
+ case Hash(TextureFormat::DXT45, UNORM, SRGB):
return PixelFormat::BC3_SRGB;
- case Hash(TextureFormat::BC4, UNORM):
+ case Hash(TextureFormat::DXN1, UNORM):
return PixelFormat::BC4_UNORM;
- case Hash(TextureFormat::BC4, SNORM):
+ case Hash(TextureFormat::DXN1, SNORM):
return PixelFormat::BC4_SNORM;
- case Hash(TextureFormat::BC5, UNORM):
+ case Hash(TextureFormat::DXN2, UNORM):
return PixelFormat::BC5_UNORM;
- case Hash(TextureFormat::BC5, SNORM):
+ case Hash(TextureFormat::DXN2, SNORM):
return PixelFormat::BC5_SNORM;
- case Hash(TextureFormat::BC7, UNORM, LINEAR):
+ case Hash(TextureFormat::BC7U, UNORM, LINEAR):
return PixelFormat::BC7_UNORM;
- case Hash(TextureFormat::BC7, UNORM, SRGB):
+ case Hash(TextureFormat::BC7U, UNORM, SRGB):
return PixelFormat::BC7_SRGB;
- case Hash(TextureFormat::BC6H_SFLOAT, FLOAT):
+ case Hash(TextureFormat::BC6H_S16, FLOAT):
return PixelFormat::BC6H_SFLOAT;
- case Hash(TextureFormat::BC6H_UFLOAT, FLOAT):
+ case Hash(TextureFormat::BC6H_U16, FLOAT):
return PixelFormat::BC6H_UFLOAT;
case Hash(TextureFormat::ASTC_2D_4X4, UNORM, LINEAR):
return PixelFormat::ASTC_2D_4X4_UNORM;
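The switch above works because Hash packs the texture format, the four component types, and the sRGB flag into a single constexpr integer, so each combination becomes one case label. A hedged sketch of how such a packer could look; the exact bit layout in format_lookup_table.cpp may differ, and overloads that default green/blue/alpha to the red component (and sRGB to LINEAR) are what let the short forms like Hash(format, UNORM) compile:

// Hedged sketch of the constexpr key packer enabling the switch above.
constexpr u32 Hash(TextureFormat format, ComponentType red, ComponentType green,
                   ComponentType blue, ComponentType alpha, bool is_srgb) {
    u32 hash = is_srgb ? 1u : 0u;
    hash |= static_cast<u32>(red) << 1;     // component types fit in 3 bits
    hash |= static_cast<u32>(green) << 4;
    hash |= static_cast<u32>(blue) << 7;
    hash |= static_cast<u32>(alpha) << 10;
    hash |= static_cast<u32>(format) << 13; // 7-bit texture format id
    return hash;
}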
diff --git a/src/video_core/texture_cache/image_base.h b/src/video_core/texture_cache/image_base.h
index e8fa592d2..329396bb6 100644
--- a/src/video_core/texture_cache/image_base.h
+++ b/src/video_core/texture_cache/image_base.h
@@ -25,7 +25,7 @@ enum class ImageFlagBits : u32 {
Registered = 1 << 6, ///< True when the image is registered
Picked = 1 << 7, ///< Temporary flag to mark the image as picked
Remapped = 1 << 8, ///< Image has been remapped.
- Sparse = 1 << 9, ///< Image has non continous submemory.
+ Sparse = 1 << 9, ///< Image has non-contiguous submemory.
// Garbage Collection Flags
BadOverlap = 1 << 10, ///< This image overlaps other but doesn't fit, has higher
diff --git a/src/video_core/texture_cache/image_info.cpp b/src/video_core/texture_cache/image_info.cpp
index e9100091e..a1296b574 100644
--- a/src/video_core/texture_cache/image_info.cpp
+++ b/src/video_core/texture_cache/image_info.cpp
@@ -216,10 +216,51 @@ ImageInfo::ImageInfo(const Tegra::Engines::Fermi2D::Surface& config) noexcept {
.height = config.height,
.depth = 1,
};
- rescaleable = block.depth == 0;
- rescaleable &= size.height > 256;
+ rescaleable = block.depth == 0 && size.height > 256;
downscaleable = size.height > 512;
}
}
+static PixelFormat ByteSizeToFormat(u32 bytes_per_pixel) {
+ switch (bytes_per_pixel) {
+ case 1:
+ return PixelFormat::R8_UINT;
+ case 2:
+ return PixelFormat::R8G8_UINT;
+ case 4:
+ return PixelFormat::A8B8G8R8_UINT;
+ case 8:
+ return PixelFormat::R16G16B16A16_UINT;
+ case 16:
+ return PixelFormat::R32G32B32A32_UINT;
+ default:
+ UNIMPLEMENTED();
+ return PixelFormat::Invalid;
+ }
+}
+
+ImageInfo::ImageInfo(const Tegra::DMA::ImageOperand& config) noexcept {
+ const u32 bytes_per_pixel = config.bytes_per_pixel;
+ format = ByteSizeToFormat(bytes_per_pixel);
+ type = config.params.block_size.depth > 0 ? ImageType::e3D : ImageType::e2D;
+ num_samples = 1;
+ block = Extent3D{
+ .width = config.params.block_size.width,
+ .height = config.params.block_size.height,
+ .depth = config.params.block_size.depth,
+ };
+ size = Extent3D{
+ .width = config.params.width,
+ .height = config.params.height,
+ .depth = config.params.depth,
+ };
+ tile_width_spacing = 0;
+ resources.levels = 1;
+ resources.layers = 1;
+ layer_stride = CalculateLayerStride(*this);
+ maybe_unaligned_layer_stride = CalculateLayerSize(*this);
+ rescaleable = block.depth == 0 && size.height > 256;
+ downscaleable = size.height > 512;
+}
+
} // namespace VideoCommon
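Because a DMA operand carries only a byte width and no real format, the new ImageInfo constructor adopts a same-size UINT format as a stand-in. A hedged usage sketch (not part of the patch; field names follow the Tegra::DMA::ImageOperand accesses above):

// A 2D DMA operand copying 4 bytes per pixel is treated as A8B8G8R8_UINT,
// which has the same stride and GOB footprint as whatever 32-bit format the
// guest actually stores there.
Tegra::DMA::ImageOperand operand{};
operand.bytes_per_pixel = 4;
operand.params.width = 256;
operand.params.height = 256;
operand.params.depth = 1; // block_size.depth == 0 selects ImageType::e2D

const VideoCommon::ImageInfo info{operand};
// info.format == PixelFormat::A8B8G8R8_UINT, info.type == ImageType::e2D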
diff --git a/src/video_core/texture_cache/image_info.h b/src/video_core/texture_cache/image_info.h
index 93755e15e..a12f5b44f 100644
--- a/src/video_core/texture_cache/image_info.h
+++ b/src/video_core/texture_cache/image_info.h
@@ -5,6 +5,7 @@
#include "video_core/engines/fermi_2d.h"
#include "video_core/engines/maxwell_3d.h"
+#include "video_core/engines/maxwell_dma.h"
#include "video_core/surface.h"
#include "video_core/texture_cache/types.h"
@@ -19,6 +20,7 @@ struct ImageInfo {
explicit ImageInfo(const Tegra::Engines::Maxwell3D::Regs& regs, size_t index) noexcept;
explicit ImageInfo(const Tegra::Engines::Maxwell3D::Regs& regs) noexcept;
explicit ImageInfo(const Tegra::Engines::Fermi2D::Surface& config) noexcept;
+ explicit ImageInfo(const Tegra::DMA::ImageOperand& config) noexcept;
PixelFormat format = PixelFormat::Invalid;
ImageType type = ImageType::e1D;
diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h
index 9dd152fbe..8e8b9a5e6 100644
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@@ -745,6 +745,25 @@ void TextureCache<P>::PopAsyncFlushes() {
}
template <class P>
+ImageId TextureCache<P>::DmaImageId(const Tegra::DMA::ImageOperand& operand) {
+ const ImageInfo dst_info(operand);
+ const ImageId dst_id = FindDMAImage(dst_info, operand.address);
+ if (!dst_id) {
+ return NULL_IMAGE_ID;
+ }
+ const auto& image = slot_images[dst_id];
+ if (False(image.flags & ImageFlagBits::GpuModified)) {
+ // No need to waste time on an image that's synced with guest
+ return NULL_IMAGE_ID;
+ }
+ const auto base = image.TryFindBase(operand.address);
+ if (!base) {
+ return NULL_IMAGE_ID;
+ }
+ return dst_id;
+}
+
+template <class P>
bool TextureCache<P>::IsRescaling() const noexcept {
return is_rescaling;
}
@@ -772,6 +791,49 @@ bool TextureCache<P>::IsRegionGpuModified(VAddr addr, size_t size) {
}
template <class P>
+std::pair<typename TextureCache<P>::Image*, BufferImageCopy> TextureCache<P>::DmaBufferImageCopy(
+ const Tegra::DMA::ImageCopy& copy_info, const Tegra::DMA::BufferOperand& buffer_operand,
+ const Tegra::DMA::ImageOperand& image_operand, ImageId image_id, bool modifies_image) {
+ const auto [level, base] = PrepareDmaImage(image_id, image_operand.address, modifies_image);
+ auto* image = &slot_images[image_id];
+ const u32 buffer_size = static_cast<u32>(buffer_operand.pitch * buffer_operand.height);
+ const u32 bpp = VideoCore::Surface::BytesPerBlock(image->info.format);
+ const auto convert = [old_bpp = image_operand.bytes_per_pixel, bpp](u32 value) {
+ return (old_bpp * value) / bpp;
+ };
+ const u32 base_x = convert(image_operand.params.origin.x.Value());
+ const u32 base_y = image_operand.params.origin.y.Value();
+ const u32 length_x = convert(copy_info.length_x);
+ const u32 length_y = copy_info.length_y;
+
+ const BufferImageCopy copy{
+ .buffer_offset = 0,
+ .buffer_size = buffer_size,
+ .buffer_row_length = convert(buffer_operand.pitch),
+ .buffer_image_height = buffer_operand.height,
+ .image_subresource =
+ {
+ .base_level = static_cast<s32>(level),
+ .base_layer = static_cast<s32>(base),
+ .num_layers = 1,
+ },
+ .image_offset =
+ {
+ .x = static_cast<s32>(base_x),
+ .y = static_cast<s32>(base_y),
+ .z = 0,
+ },
+ .image_extent =
+ {
+ .width = length_x,
+ .height = length_y,
+ .depth = 1,
+ },
+ };
+ return {image, copy};
+}
+
+template <class P>
void TextureCache<P>::RefreshContents(Image& image, ImageId image_id) {
if (False(image.flags & ImageFlagBits::CpuModified)) {
// Only upload modified images
@@ -1359,6 +1421,63 @@ std::optional<typename TextureCache<P>::BlitImages> TextureCache<P>::GetBlitImag
}
template <class P>
+ImageId TextureCache<P>::FindDMAImage(const ImageInfo& info, GPUVAddr gpu_addr) {
+ std::optional<VAddr> cpu_addr = gpu_memory->GpuToCpuAddress(gpu_addr);
+ if (!cpu_addr) {
+ cpu_addr = gpu_memory->GpuToCpuAddress(gpu_addr, CalculateGuestSizeInBytes(info));
+ if (!cpu_addr) {
+ return ImageId{};
+ }
+ }
+ ImageId image_id{};
+ boost::container::small_vector<ImageId, 1> image_ids;
+ const auto lambda = [&](ImageId existing_image_id, ImageBase& existing_image) {
+ if (True(existing_image.flags & ImageFlagBits::Remapped)) {
+ return false;
+ }
+ if (info.type == ImageType::Linear || existing_image.info.type == ImageType::Linear)
+ [[unlikely]] {
+ const bool strict_size = True(existing_image.flags & ImageFlagBits::Strong);
+ const ImageInfo& existing = existing_image.info;
+ if (existing_image.gpu_addr == gpu_addr && existing.type == info.type &&
+ existing.pitch == info.pitch &&
+ IsPitchLinearSameSize(existing, info, strict_size) &&
+ IsViewCompatible(existing.format, info.format, false, true)) {
+ image_id = existing_image_id;
+ image_ids.push_back(existing_image_id);
+ return true;
+ }
+ } else if (IsSubCopy(info, existing_image, gpu_addr)) {
+ image_id = existing_image_id;
+ image_ids.push_back(existing_image_id);
+ return true;
+ }
+ return false;
+ };
+ ForEachImageInRegion(*cpu_addr, CalculateGuestSizeInBytes(info), lambda);
+ if (image_ids.size() <= 1) [[likely]] {
+ return image_id;
+ }
+ auto image_ids_compare = [this](ImageId a, ImageId b) {
+ auto& image_a = slot_images[a];
+ auto& image_b = slot_images[b];
+ return image_a.modification_tick < image_b.modification_tick;
+ };
+ return *std::ranges::max_element(image_ids, image_ids_compare);
+}
+
+template <class P>
+std::pair<u32, u32> TextureCache<P>::PrepareDmaImage(ImageId dst_id, GPUVAddr base_addr,
+ bool mark_as_modified) {
+ const auto& image = slot_images[dst_id];
+ const auto base = image.TryFindBase(base_addr);
+ PrepareImage(dst_id, mark_as_modified, false);
+ const auto& new_image = slot_images[dst_id];
+ lru_cache.Touch(new_image.lru_index, frame_tick);
+ return std::make_pair(base->level, base->layer);
+}
+
+template <class P>
SamplerId TextureCache<P>::FindSampler(const TSCEntry& config) {
if (std::ranges::all_of(config.raw, [](u64 value) { return value == 0; })) {
return NULL_SAMPLER_ID;
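DmaImageId and DmaBufferImageCopy above are the two halves of the accelerated DMA path: the first finds a GPU-modified cached image that safely covers the operand, the second builds the buffer<->image copy in that image's units. A hedged caller sketch; the real call sites are the rasterizers' AccelerateDMA paths, and every name below that is not in the diff is hypothetical:

template <class TextureCache>
bool TryAccelerateBufferToImage(TextureCache& cache, const Tegra::DMA::ImageCopy& copy_info,
                                const Tegra::DMA::BufferOperand& buffer,
                                const Tegra::DMA::ImageOperand& image_operand) {
    const VideoCommon::ImageId image_id = cache.DmaImageId(image_operand);
    if (!image_id) {
        return false; // No safe cached candidate: fall back to CPU swizzling.
    }
    // modifies_image = true: the image is the destination of this copy.
    auto [image, copy] =
        cache.DmaBufferImageCopy(copy_info, buffer, image_operand, image_id, true);
    // Stage the guest bytes and upload with the new raw-buffer overload:
    // image->UploadMemory(staging.buffer, staging.offset, std::span{&copy, 1});
    return true;
}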
diff --git a/src/video_core/texture_cache/texture_cache_base.h b/src/video_core/texture_cache/texture_cache_base.h
index 013836933..5a5b4179c 100644
--- a/src/video_core/texture_cache/texture_cache_base.h
+++ b/src/video_core/texture_cache/texture_cache_base.h
@@ -209,6 +209,12 @@ public:
/// Pop asynchronous downloads
void PopAsyncFlushes();
+ [[nodiscard]] ImageId DmaImageId(const Tegra::DMA::ImageOperand& operand);
+
+ [[nodiscard]] std::pair<Image*, BufferImageCopy> DmaBufferImageCopy(
+ const Tegra::DMA::ImageCopy& copy_info, const Tegra::DMA::BufferOperand& buffer_operand,
+ const Tegra::DMA::ImageOperand& image_operand, ImageId image_id, bool modifies_image);
+
/// Return true when a CPU region is modified from the GPU
[[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size);
@@ -300,6 +306,8 @@ private:
/// Remove joined images from the cache
[[nodiscard]] ImageId JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VAddr cpu_addr);
+ [[nodiscard]] ImageId FindDMAImage(const ImageInfo& info, GPUVAddr gpu_addr);
+
/// Return a blit image pair from the given guest blit parameters
[[nodiscard]] std::optional<BlitImages> GetBlitImages(
const Tegra::Engines::Fermi2D::Surface& dst, const Tegra::Engines::Fermi2D::Surface& src,
@@ -381,6 +389,9 @@ private:
/// Returns true if the current clear parameters clear the whole image of a given image view
[[nodiscard]] bool IsFullClear(ImageViewId id);
+ [[nodiscard]] std::pair<u32, u32> PrepareDmaImage(ImageId dst_id, GPUVAddr base_addr,
+ bool mark_as_modified);
+
bool ImageCanRescale(ImageBase& image);
void InvalidateScale(Image& image);
bool ScaleUp(Image& image);
diff --git a/src/video_core/texture_cache/types.h b/src/video_core/texture_cache/types.h
index 0453456b4..a0e10643f 100644
--- a/src/video_core/texture_cache/types.h
+++ b/src/video_core/texture_cache/types.h
@@ -54,6 +54,7 @@ enum class RelaxedOptions : u32 {
Format = 1 << 1,
Samples = 1 << 2,
ForceBrokenViews = 1 << 3,
+ FormatBpp = 1 << 4,
};
DECLARE_ENUM_FLAG_OPERATORS(RelaxedOptions)
diff --git a/src/video_core/texture_cache/util.cpp b/src/video_core/texture_cache/util.cpp
index 697f86641..de37db684 100644
--- a/src/video_core/texture_cache/util.cpp
+++ b/src/video_core/texture_cache/util.cpp
@@ -743,6 +743,44 @@ std::vector<ImageCopy> MakeShrinkImageCopies(const ImageInfo& dst, const ImageIn
return copies;
}
+std::vector<ImageCopy> MakeReinterpretImageCopies(const ImageInfo& src, u32 up_scale,
+ u32 down_shift) {
+ std::vector<ImageCopy> copies;
+ copies.reserve(src.resources.levels);
+ const bool is_3d = src.type == ImageType::e3D;
+ for (s32 level = 0; level < src.resources.levels; ++level) {
+ ImageCopy& copy = copies.emplace_back();
+ copy.src_subresource = SubresourceLayers{
+ .base_level = level,
+ .base_layer = 0,
+ .num_layers = src.resources.layers,
+ };
+ copy.dst_subresource = SubresourceLayers{
+ .base_level = level,
+ .base_layer = 0,
+ .num_layers = src.resources.layers,
+ };
+ copy.src_offset = Offset3D{
+ .x = 0,
+ .y = 0,
+ .z = 0,
+ };
+ copy.dst_offset = Offset3D{
+ .x = 0,
+ .y = 0,
+ .z = 0,
+ };
+ const Extent3D mip_size = AdjustMipSize(src.size, level);
+ copy.extent = AdjustSamplesSize(mip_size, src.num_samples);
+ if (is_3d) {
+ copy.extent.depth = src.size.depth;
+ }
+ copy.extent.width = std::max<u32>((copy.extent.width * up_scale) >> down_shift, 1);
+ copy.extent.height = std::max<u32>((copy.extent.height * up_scale) >> down_shift, 1);
+ }
+ return copies;
+}
+
bool IsValidEntry(const Tegra::MemoryManager& gpu_memory, const TICEntry& config) {
const GPUVAddr address = config.Address();
if (address == 0) {
@@ -999,6 +1037,20 @@ bool IsBlockLinearSizeCompatible(const ImageInfo& lhs, const ImageInfo& rhs, u32
}
}
+bool IsBlockLinearSizeCompatibleBPPRelaxed(const ImageInfo& lhs, const ImageInfo& rhs,
+ u32 lhs_level, u32 rhs_level) noexcept {
+ ASSERT(lhs.type != ImageType::Linear);
+ ASSERT(rhs.type != ImageType::Linear);
+ const auto lhs_bpp = BytesPerBlock(lhs.format);
+ const auto rhs_bpp = BytesPerBlock(rhs.format);
+ const Extent3D lhs_size = AdjustMipSize(lhs.size, lhs_level);
+ const Extent3D rhs_size = AdjustMipSize(rhs.size, rhs_level);
+ return Common::AlignUpLog2(lhs_size.width * lhs_bpp, GOB_SIZE_X_SHIFT) ==
+ Common::AlignUpLog2(rhs_size.width * rhs_bpp, GOB_SIZE_X_SHIFT) &&
+ Common::AlignUpLog2(lhs_size.height, GOB_SIZE_Y_SHIFT) ==
+ Common::AlignUpLog2(rhs_size.height, GOB_SIZE_Y_SHIFT);
+}
+
bool IsPitchLinearSameSize(const ImageInfo& lhs, const ImageInfo& rhs, bool strict_size) noexcept {
ASSERT(lhs.type == ImageType::Linear);
ASSERT(rhs.type == ImageType::Linear);
@@ -1073,7 +1125,8 @@ std::optional<SubresourceBase> FindSubresource(const ImageInfo& candidate, const
// Format checking is relaxed, but we still have to check for matching bytes per block.
// This avoids creating a view for blits on UE4 titles where formats with different bytes
// per block are aliased.
- if (BytesPerBlock(existing.format) != BytesPerBlock(candidate.format)) {
+ if (BytesPerBlock(existing.format) != BytesPerBlock(candidate.format) &&
+ False(options & RelaxedOptions::FormatBpp)) {
return std::nullopt;
}
} else {
@@ -1088,10 +1141,8 @@ std::optional<SubresourceBase> FindSubresource(const ImageInfo& candidate, const
if (existing.type != candidate.type) {
return std::nullopt;
}
- if (False(options & RelaxedOptions::Samples)) {
- if (existing.num_samples != candidate.num_samples) {
- return std::nullopt;
- }
+ if (False(options & RelaxedOptions::Samples) && existing.num_samples != candidate.num_samples) {
+ return std::nullopt;
}
if (existing.resources.levels < candidate.resources.levels + base->level) {
return std::nullopt;
@@ -1101,14 +1152,16 @@ std::optional<SubresourceBase> FindSubresource(const ImageInfo& candidate, const
if (mip_depth < candidate.size.depth + base->layer) {
return std::nullopt;
}
- } else {
- if (existing.resources.layers < candidate.resources.layers + base->layer) {
- return std::nullopt;
- }
+ } else if (existing.resources.layers < candidate.resources.layers + base->layer) {
+ return std::nullopt;
}
const bool strict_size = False(options & RelaxedOptions::Size);
if (!IsBlockLinearSizeCompatible(existing, candidate, base->level, 0, strict_size)) {
- return std::nullopt;
+ if (False(options & RelaxedOptions::FormatBpp)) {
+ return std::nullopt;
+ } else if (!IsBlockLinearSizeCompatibleBPPRelaxed(existing, candidate, base->level, 0)) {
+ return std::nullopt;
+ }
}
// TODO: compare block sizes
return base;
@@ -1120,6 +1173,31 @@ bool IsSubresource(const ImageInfo& candidate, const ImageBase& image, GPUVAddr
.has_value();
}
+bool IsSubCopy(const ImageInfo& candidate, const ImageBase& image, GPUVAddr candidate_addr) {
+ const std::optional<SubresourceBase> base = image.TryFindBase(candidate_addr);
+ if (!base) {
+ return false;
+ }
+ const ImageInfo& existing = image.info;
+ if (existing.resources.levels < candidate.resources.levels + base->level) {
+ return false;
+ }
+ if (existing.type == ImageType::e3D) {
+ const u32 mip_depth = std::max(1U, existing.size.depth >> base->level);
+ if (mip_depth < candidate.size.depth + base->layer) {
+ return false;
+ }
+ } else {
+ if (existing.resources.layers < candidate.resources.layers + base->layer) {
+ return false;
+ }
+ }
+ if (!IsBlockLinearSizeCompatibleBPPRelaxed(existing, candidate, base->level, 0)) {
+ return false;
+ }
+ return true;
+}
+
void DeduceBlitImages(ImageInfo& dst_info, ImageInfo& src_info, const ImageBase* dst,
const ImageBase* src) {
const auto original_dst_format = dst_info.format;
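IsBlockLinearSizeCompatibleBPPRelaxed above compares row sizes in bytes rather than texels, aligned up to GOB granularity (64-byte rows, 8-row GOBs), so two levels with different formats but the same width * bpp product count as compatible. A self-contained worked example (AlignUpLog2 reimplemented here as a sketch of Common::AlignUpLog2):

#include <cstdint>

using u32 = std::uint32_t;

// Hedged sketch: align a value up to a power-of-two boundary given its shift.
constexpr u32 AlignUpLog2(u32 value, u32 shift) {
    return ((value + (1u << shift) - 1) >> shift) << shift;
}

// A 128-texel-wide R32G32B32A32 level (16 bpp) and a 512-texel-wide
// A8B8G8R8 level (4 bpp) both occupy 2048 bytes per GOB-aligned row:
static_assert(AlignUpLog2(128 * 16, 6) == AlignUpLog2(512 * 4, 6));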
diff --git a/src/video_core/texture_cache/util.h b/src/video_core/texture_cache/util.h
index d103db8ae..84aa6880d 100644
--- a/src/video_core/texture_cache/util.h
+++ b/src/video_core/texture_cache/util.h
@@ -56,6 +56,10 @@ struct OverlapResult {
SubresourceBase base, u32 up_scale = 1,
u32 down_shift = 0);
+[[nodiscard]] std::vector<ImageCopy> MakeReinterpretImageCopies(const ImageInfo& src,
+ u32 up_scale = 1,
+ u32 down_shift = 0);
+
[[nodiscard]] bool IsValidEntry(const Tegra::MemoryManager& gpu_memory, const TICEntry& config);
[[nodiscard]] std::vector<BufferImageCopy> UnswizzleImage(Tegra::MemoryManager& gpu_memory,
@@ -88,6 +92,9 @@ void SwizzleImage(Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr, const Ima
[[nodiscard]] bool IsPitchLinearSameSize(const ImageInfo& lhs, const ImageInfo& rhs,
bool strict_size) noexcept;
+[[nodiscard]] bool IsBlockLinearSizeCompatibleBPPRelaxed(const ImageInfo& lhs, const ImageInfo& rhs,
+ u32 lhs_level, u32 rhs_level) noexcept;
+
[[nodiscard]] std::optional<OverlapResult> ResolveOverlap(const ImageInfo& new_info,
GPUVAddr gpu_addr, VAddr cpu_addr,
const ImageBase& overlap,
@@ -106,6 +113,9 @@ void SwizzleImage(Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr, const Ima
GPUVAddr candidate_addr, RelaxedOptions options, bool broken_views,
bool native_bgr);
+[[nodiscard]] bool IsSubCopy(const ImageInfo& candidate, const ImageBase& image,
+ GPUVAddr candidate_addr);
+
void DeduceBlitImages(ImageInfo& dst_info, ImageInfo& src_info, const ImageBase* dst,
const ImageBase* src);
diff --git a/src/video_core/textures/astc.cpp b/src/video_core/textures/astc.cpp
index 4381eed1d..a68bc0d77 100644
--- a/src/video_core/textures/astc.cpp
+++ b/src/video_core/textures/astc.cpp
@@ -1571,7 +1571,7 @@ static void DecompressBlock(std::span<const u8, 16> inBuf, const u32 blockWidth,
assert(strm.GetBitsRead() + weightParams.GetPackedBitSize() == 128);
// Decode both color data and texel weight data
- u32 colorValues[32]; // Four values, two endpoints, four maximum paritions
+ u32 colorValues[32]; // Four values, two endpoints, four maximum partitions
DecodeColorValues(colorValues, colorEndpointData, colorEndpointMode, nPartitions,
colorDataBits);
diff --git a/src/video_core/textures/texture.h b/src/video_core/textures/texture.h
index 7c4553a53..7e5837b20 100644
--- a/src/video_core/textures/texture.h
+++ b/src/video_core/textures/texture.h
@@ -15,26 +15,26 @@ enum class TextureFormat : u32 {
R32G32B32 = 0x02,
R16G16B16A16 = 0x03,
R32G32 = 0x04,
- R32_B24G8 = 0x05,
+ R32B24G8 = 0x05,
ETC2_RGB = 0x06,
X8B8G8R8 = 0x07,
- A8R8G8B8 = 0x08,
+ A8B8G8R8 = 0x08,
A2B10G10R10 = 0x09,
ETC2_RGB_PTA = 0x0a,
ETC2_RGBA = 0x0b,
R16G16 = 0x0c,
- R24G8 = 0x0d,
- R8G24 = 0x0e,
+ G8R24 = 0x0d,
+ G24R8 = 0x0e,
R32 = 0x0f,
- BC6H_SFLOAT = 0x10,
- BC6H_UFLOAT = 0x11,
+ BC6H_S16 = 0x10,
+ BC6H_U16 = 0x11,
A4B4G4R4 = 0x12,
A5B5G5R1 = 0x13,
A1B5G5R5 = 0x14,
B5G6R5 = 0x15,
B6G5R5 = 0x16,
- BC7 = 0x17,
- R8G8 = 0x18,
+ BC7U = 0x17,
+ G8R8 = 0x18,
EAC = 0x19,
EACX2 = 0x1a,
R16 = 0x1b,
@@ -46,33 +46,33 @@ enum class TextureFormat : u32 {
B10G11R11 = 0x21,
G8B8G8R8 = 0x22,
B8G8R8G8 = 0x23,
- BC1_RGBA = 0x24,
- BC2 = 0x25,
- BC3 = 0x26,
- BC4 = 0x27,
- BC5 = 0x28,
- S8D24 = 0x29,
- X8D24 = 0x2a,
- D24S8 = 0x2b,
- X4V4D24__COV4R4V = 0x2c,
- X4V4D24__COV8R8V = 0x2d,
- V8D24__COV4R12V = 0x2e,
- D32 = 0x2f,
- D32S8 = 0x30,
- X8D24_X20V4S8__COV4R4V = 0x31,
- X8D24_X20V4S8__COV8R8V = 0x32,
- D32_X20V4X8__COV4R4V = 0x33,
- D32_X20V4X8__COV8R8V = 0x34,
- D32_X20V4S8__COV4R4V = 0x35,
- D32_X20V4S8__COV8R8V = 0x36,
- X8D24_X16V8S8__COV4R12V = 0x37,
- D32_X16V8X8__COV4R12V = 0x38,
- D32_X16V8S8__COV4R12V = 0x39,
- D16 = 0x3a,
- V8D24__COV8R24V = 0x3b,
- X8D24_X16V8S8__COV8R24V = 0x3c,
- D32_X16V8X8__COV8R24V = 0x3d,
- D32_X16V8S8__COV8R24V = 0x3e,
+ DXT1 = 0x24,
+ DXT23 = 0x25,
+ DXT45 = 0x26,
+ DXN1 = 0x27,
+ DXN2 = 0x28,
+ Z24S8 = 0x29,
+ X8Z24 = 0x2a,
+ S8Z24 = 0x2b,
+ X4V4Z24__COV4R4V = 0x2c,
+ X4V4Z24__COV8R8V = 0x2d,
+ V8Z24__COV4R12V = 0x2e,
+ Z32 = 0x2f,
+ Z32_X24S8 = 0x30,
+ X8Z24_X20V4S8__COV4R4V = 0x31,
+ X8Z24_X20V4S8__COV8R8V = 0x32,
+ Z32_X20V4X8__COV4R4V = 0x33,
+ Z32_X20V4X8__COV8R8V = 0x34,
+ Z32_X20V4S8__COV4R4V = 0x35,
+ Z32_X20V4S8__COV8R8V = 0x36,
+ X8Z24_X16V8S8__COV4R12V = 0x37,
+ Z32_X16V8X8__COV4R12V = 0x38,
+ Z32_X16V8S8__COV4R12V = 0x39,
+ Z16 = 0x3a,
+ V8Z24__COV8R24V = 0x3b,
+ X8Z24_X16V8S8__COV8R24V = 0x3c,
+ Z32_X16V8X8__COV8R24V = 0x3d,
+ Z32_X16V8S8__COV8R24V = 0x3e,
ASTC_2D_4X4 = 0x40,
ASTC_2D_5X5 = 0x41,
ASTC_2D_6X6 = 0x42,
diff --git a/src/video_core/vulkan_common/vulkan_device.cpp b/src/video_core/vulkan_common/vulkan_device.cpp
index 48f1a3d14..6f288b3f8 100644
--- a/src/video_core/vulkan_common/vulkan_device.cpp
+++ b/src/video_core/vulkan_common/vulkan_device.cpp
@@ -401,6 +401,12 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR
loaded_extensions.erase(VK_EXT_EXTENDED_DYNAMIC_STATE_2_EXTENSION_NAME);
}
}
+ if (extensions.extended_dynamic_state3 && is_radv) {
+ LOG_WARNING(Render_Vulkan, "RADV has broken extendedDynamicState3ColorBlendEquation");
+ features.extended_dynamic_state3.extendedDynamicState3ColorBlendEnable = false;
+ features.extended_dynamic_state3.extendedDynamicState3ColorBlendEquation = false;
+ dynamic_state3_blending = false;
+ }
if (extensions.vertex_input_dynamic_state && is_radv) {
// TODO(ameerj): Blacklist only offending driver versions
// TODO(ameerj): Confirm if RDNA1 is affected
@@ -417,7 +423,7 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR
sets_per_pool = 64;
if (is_amd_driver) {
- // AMD drivers need a higher amount of Sets per Pool in certain circunstances like in XC2.
+ // AMD drivers need a higher amount of Sets per Pool in certain circumstances like in XC2.
sets_per_pool = 96;
// Disable VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT on AMD GCN4 and lower as it is broken.
if (!features.shader_float16_int8.shaderFloat16) {
diff --git a/src/video_core/vulkan_common/vulkan_device.h b/src/video_core/vulkan_common/vulkan_device.h
index 0662a2d9f..41b5da18a 100644
--- a/src/video_core/vulkan_common/vulkan_device.h
+++ b/src/video_core/vulkan_common/vulkan_device.h
@@ -180,7 +180,7 @@ public:
~Device();
/**
- * Returns a format supported by the device for the passed requeriments.
+ * Returns a format supported by the device for the passed requirements.
* @param wanted_format The ideal format to be returned. It may not be the returned format.
* @param wanted_usage The usage that must be fulfilled even if the format is not supported.
* @param format_type Format type usage.
@@ -259,12 +259,12 @@ public:
bool ShouldBoostClocks() const;
- /// Returns uniform buffer alignment requeriment.
+ /// Returns uniform buffer alignment requirement.
VkDeviceSize GetUniformBufferAlignment() const {
return properties.properties.limits.minUniformBufferOffsetAlignment;
}
- /// Returns storage alignment requeriment.
+ /// Returns storage alignment requirement.
VkDeviceSize GetStorageBufferAlignment() const {
return properties.properties.limits.minStorageBufferOffsetAlignment;
}
@@ -656,7 +656,7 @@ private:
bool is_integrated{}; ///< Is GPU an iGPU.
bool is_virtual{}; ///< Is GPU a virtual GPU.
bool is_non_gpu{}; ///< Is SoftwareRasterizer, FPGA, non-GPU device.
- bool has_broken_cube_compatibility{}; ///< Has broken cube compatiblity bit
+ bool has_broken_cube_compatibility{}; ///< Has broken cube compatibility bit
bool has_renderdoc{}; ///< Has RenderDoc attached
bool has_nsight_graphics{}; ///< Has Nsight Graphics attached
bool supports_d24_depth{}; ///< Supports D24 depth buffers.
diff --git a/src/video_core/vulkan_common/vulkan_wrapper.h b/src/video_core/vulkan_common/vulkan_wrapper.h
index e86f661cb..4ff328a21 100644
--- a/src/video_core/vulkan_common/vulkan_wrapper.h
+++ b/src/video_core/vulkan_common/vulkan_wrapper.h
@@ -68,7 +68,7 @@ public:
constexpr Span(const Range& range) : ptr{std::data(range)}, num{std::size(range)} {}
/// Construct a span from a pointer and a size.
- /// This is inteded for subranges.
+ /// This is intended for subranges.
constexpr Span(const T* ptr_, std::size_t num_) noexcept : ptr{ptr_}, num{num_} {}
/// Returns the data pointer by the span.
@@ -390,11 +390,11 @@ public:
Handle(const Handle&) = delete;
Handle& operator=(const Handle&) = delete;
- /// Construct a handle transfering the ownership from another handle.
+ /// Construct a handle transferring the ownership from another handle.
Handle(Handle&& rhs) noexcept
: handle{std::exchange(rhs.handle, nullptr)}, owner{rhs.owner}, dld{rhs.dld} {}
- /// Assign the current handle transfering the ownership from another handle.
+ /// Assign the current handle transferring the ownership from another handle.
/// Destroys any previously held object.
Handle& operator=(Handle&& rhs) noexcept {
Release();
@@ -463,10 +463,10 @@ public:
Handle(const Handle&) = delete;
Handle& operator=(const Handle&) = delete;
- /// Construct a handle transfering ownership from another handle.
+ /// Construct a handle transferring ownership from another handle.
Handle(Handle&& rhs) noexcept : handle{std::exchange(rhs.handle, nullptr)}, dld{rhs.dld} {}
- /// Assign the current handle transfering the ownership from another handle.
+ /// Assign the current handle transferring the ownership from another handle.
/// Destroys any previously held object.
Handle& operator=(Handle&& rhs) noexcept {
Release();
@@ -533,12 +533,12 @@ public:
PoolAllocations(const PoolAllocations&) = delete;
PoolAllocations& operator=(const PoolAllocations&) = delete;
- /// Construct an allocation transfering ownership from another allocation.
+ /// Construct an allocation transferring ownership from another allocation.
PoolAllocations(PoolAllocations&& rhs) noexcept
: allocations{std::move(rhs.allocations)}, num{rhs.num}, device{rhs.device}, pool{rhs.pool},
dld{rhs.dld} {}
- /// Assign an allocation transfering ownership from another allocation.
+ /// Assign an allocation transferring ownership from another allocation.
PoolAllocations& operator=(PoolAllocations&& rhs) noexcept {
allocations = std::move(rhs.allocations);
num = rhs.num;
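The move operations above all follow the same exchange idiom: steal the source's raw handle and leave it null, so the moved-from object's destructor releases nothing. A minimal sketch of the same move-only RAII pattern outside Vulkan, wrapping a hypothetical POSIX file descriptor:

#include <unistd.h>
#include <utility>

// Hedged sketch of the move-only handle idiom used by vk::Handle.
class FileHandle {
public:
    explicit FileHandle(int fd_) noexcept : fd{fd_} {}

    FileHandle(const FileHandle&) = delete;
    FileHandle& operator=(const FileHandle&) = delete;

    // Construct a handle transferring ownership; rhs is left empty.
    FileHandle(FileHandle&& rhs) noexcept : fd{std::exchange(rhs.fd, -1)} {}

    // Assign transferring ownership; destroys any previously held object.
    FileHandle& operator=(FileHandle&& rhs) noexcept {
        Release();
        fd = std::exchange(rhs.fd, -1);
        return *this;
    }

    ~FileHandle() { Release(); }

private:
    void Release() noexcept {
        if (fd >= 0) {
            ::close(fd); // any releaser works here; close() is illustrative
        }
        fd = -1;
    }

    int fd = -1;
};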