diff options
-rw-r--r-- | src/common/scratch_buffer.h | 1 | ||||
-rw-r--r-- | src/common/settings.cpp | 2 | ||||
-rw-r--r-- | src/common/settings.h | 1 | ||||
-rw-r--r-- | src/video_core/renderer_opengl/gl_texture_cache.cpp | 17 | ||||
-rw-r--r-- | src/video_core/renderer_vulkan/vk_texture_cache.cpp | 7 | ||||
-rw-r--r-- | src/video_core/texture_cache/image_base.h | 3 | ||||
-rw-r--r-- | src/video_core/texture_cache/texture_cache.h | 70 | ||||
-rw-r--r-- | src/video_core/texture_cache/texture_cache_base.h | 16 | ||||
-rw-r--r-- | src/video_core/textures/astc.cpp | 4 | ||||
-rw-r--r-- | src/yuzu/configuration/config.cpp | 2 | ||||
-rw-r--r-- | src/yuzu/configuration/configure_graphics_advanced.cpp | 7 | ||||
-rw-r--r-- | src/yuzu/configuration/configure_graphics_advanced.h | 1 | ||||
-rw-r--r-- | src/yuzu/configuration/configure_graphics_advanced.ui | 10 | ||||
-rw-r--r-- | src/yuzu_cmd/config.cpp | 1 | ||||
-rw-r--r-- | src/yuzu_cmd/default_ini.h | 4 |
15 files changed, 138 insertions, 8 deletions
diff --git a/src/common/scratch_buffer.h b/src/common/scratch_buffer.h index 1245a5086..26d4e76dc 100644 --- a/src/common/scratch_buffer.h +++ b/src/common/scratch_buffer.h @@ -23,6 +23,7 @@ public: buffer{Common::make_unique_for_overwrite<T[]>(initial_capacity)} {} ~ScratchBuffer() = default; + ScratchBuffer(ScratchBuffer&&) = default; /// This will only grow the buffer's capacity if size is greater than the current capacity. /// The previously held data will remain intact. diff --git a/src/common/settings.cpp b/src/common/settings.cpp index 749ac213f..84955030b 100644 --- a/src/common/settings.cpp +++ b/src/common/settings.cpp @@ -59,6 +59,7 @@ void LogSettings() { values.use_asynchronous_gpu_emulation.GetValue()); log_setting("Renderer_NvdecEmulation", values.nvdec_emulation.GetValue()); log_setting("Renderer_AccelerateASTC", values.accelerate_astc.GetValue()); + log_setting("Renderer_AsyncASTC", values.async_astc.GetValue()); log_setting("Renderer_UseVsync", values.use_vsync.GetValue()); log_setting("Renderer_ShaderBackend", values.shader_backend.GetValue()); log_setting("Renderer_UseAsynchronousShaders", values.use_asynchronous_shaders.GetValue()); @@ -219,6 +220,7 @@ void RestoreGlobalState(bool is_powered_on) { values.use_asynchronous_gpu_emulation.SetGlobal(true); values.nvdec_emulation.SetGlobal(true); values.accelerate_astc.SetGlobal(true); + values.async_astc.SetGlobal(true); values.use_vsync.SetGlobal(true); values.shader_backend.SetGlobal(true); values.use_asynchronous_shaders.SetGlobal(true); diff --git a/src/common/settings.h b/src/common/settings.h index 9fe764e86..4d0694b7d 100644 --- a/src/common/settings.h +++ b/src/common/settings.h @@ -453,6 +453,7 @@ struct Values { SwitchableSetting<bool> use_asynchronous_gpu_emulation{true, "use_asynchronous_gpu_emulation"}; SwitchableSetting<NvdecEmulation> nvdec_emulation{NvdecEmulation::GPU, "nvdec_emulation"}; SwitchableSetting<bool> accelerate_astc{true, "accelerate_astc"}; + SwitchableSetting<bool> 
async_astc{false, "async_astc"}; SwitchableSetting<bool> use_vsync{true, "use_vsync"}; SwitchableSetting<ShaderBackend, true> shader_backend{ShaderBackend::GLSL, ShaderBackend::GLSL, ShaderBackend::SPIRV, "shader_backend"}; diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index eb6e43a08..b047e7b3d 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -228,8 +228,9 @@ void ApplySwizzle(GLuint handle, PixelFormat format, std::array<SwizzleSource, 4 [[nodiscard]] bool CanBeAccelerated(const TextureCacheRuntime& runtime, const VideoCommon::ImageInfo& info) { - if (IsPixelFormatASTC(info.format)) { - return !runtime.HasNativeASTC() && Settings::values.accelerate_astc.GetValue(); + if (IsPixelFormatASTC(info.format) && !runtime.HasNativeASTC()) { + return Settings::values.accelerate_astc.GetValue() && + !Settings::values.async_astc.GetValue(); } // Disable other accelerated uploads for now as they don't implement swizzled uploads return false; @@ -258,6 +259,14 @@ void ApplySwizzle(GLuint handle, PixelFormat format, std::array<SwizzleSource, 4 return format_info.compatibility_class == store_class; } +[[nodiscard]] bool CanBeDecodedAsync(const TextureCacheRuntime& runtime, + const VideoCommon::ImageInfo& info) { + if (IsPixelFormatASTC(info.format) && !runtime.HasNativeASTC()) { + return Settings::values.async_astc.GetValue(); + } + return false; +} + [[nodiscard]] CopyOrigin MakeCopyOrigin(VideoCommon::Offset3D offset, VideoCommon::SubresourceLayers subresource, GLenum target) { switch (target) { @@ -721,7 +730,9 @@ std::optional<size_t> TextureCacheRuntime::StagingBuffers::FindBuffer(size_t req Image::Image(TextureCacheRuntime& runtime_, const VideoCommon::ImageInfo& info_, GPUVAddr gpu_addr_, VAddr cpu_addr_) : VideoCommon::ImageBase(info_, gpu_addr_, cpu_addr_), runtime{&runtime_} { - if (CanBeAccelerated(*runtime, info)) { + 
if (CanBeDecodedAsync(*runtime, info)) { + flags |= ImageFlagBits::AsynchronousDecode; + } else if (CanBeAccelerated(*runtime, info)) { flags |= ImageFlagBits::AcceleratedUpload; } if (IsConverted(runtime->device, info.format, info.type)) { diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index 9b85dfb5e..80adb70eb 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -1256,11 +1256,12 @@ Image::Image(TextureCacheRuntime& runtime_, const ImageInfo& info_, GPUVAddr gpu commit(runtime_.memory_allocator.Commit(original_image, MemoryUsage::DeviceLocal)), aspect_mask(ImageAspectMask(info.format)) { if (IsPixelFormatASTC(info.format) && !runtime->device.IsOptimalAstcSupported()) { - if (Settings::values.accelerate_astc.GetValue()) { + if (Settings::values.async_astc.GetValue()) { + flags |= VideoCommon::ImageFlagBits::AsynchronousDecode; + } else if (Settings::values.accelerate_astc.GetValue()) { flags |= VideoCommon::ImageFlagBits::AcceleratedUpload; - } else { - flags |= VideoCommon::ImageFlagBits::Converted; } + flags |= VideoCommon::ImageFlagBits::Converted; flags |= VideoCommon::ImageFlagBits::CostlyLoad; } if (runtime->device.HasDebuggingToolAttached()) { diff --git a/src/video_core/texture_cache/image_base.h b/src/video_core/texture_cache/image_base.h index 620565684..e8fa592d2 100644 --- a/src/video_core/texture_cache/image_base.h +++ b/src/video_core/texture_cache/image_base.h @@ -38,6 +38,9 @@ enum class ImageFlagBits : u32 { Rescaled = 1 << 13, CheckingRescalable = 1 << 14, IsRescalable = 1 << 15, + + AsynchronousDecode = 1 << 16, + IsDecoding = 1 << 17, ///< Is currently being decoded asynchronously. 
}; DECLARE_ENUM_FLAG_OPERATORS(ImageFlagBits) diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index 3e2cbb0b0..9dd152fbe 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -85,6 +85,11 @@ void TextureCache<P>::RunGarbageCollector() { } --num_iterations; auto& image = slot_images[image_id]; + if (True(image.flags & ImageFlagBits::IsDecoding)) { + // This image is still being decoded, deleting it will invalidate the slot + // used by the async decoder thread. + return false; + } const bool must_download = image.IsSafeDownload() && False(image.flags & ImageFlagBits::BadOverlap); if (!high_priority_mode && @@ -133,6 +138,8 @@ void TextureCache<P>::TickFrame() { sentenced_images.Tick(); sentenced_framebuffers.Tick(); sentenced_image_view.Tick(); + TickAsyncDecode(); + runtime.TickFrame(); critical_gc = 0; ++frame_tick; @@ -777,6 +784,10 @@ void TextureCache<P>::RefreshContents(Image& image, ImageId image_id) { LOG_WARNING(HW_GPU, "MSAA image uploads are not implemented"); return; } + if (True(image.flags & ImageFlagBits::AsynchronousDecode)) { + QueueAsyncDecode(image, image_id); + return; + } auto staging = runtime.UploadStagingBuffer(MapSizeBytes(image)); UploadImageContents(image, staging); runtime.InsertUploadMemoryBarrier(); @@ -990,6 +1001,65 @@ u64 TextureCache<P>::GetScaledImageSizeBytes(const ImageBase& image) { } template <class P> +void TextureCache<P>::QueueAsyncDecode(Image& image, ImageId image_id) { + UNIMPLEMENTED_IF(False(image.flags & ImageFlagBits::Converted)); + LOG_INFO(HW_GPU, "Queuing async texture decode"); + + image.flags |= ImageFlagBits::IsDecoding; + auto decode = std::make_unique<AsyncDecodeContext>(); + auto* decode_ptr = decode.get(); + decode->image_id = image_id; + async_decodes.push_back(std::move(decode)); + + Common::ScratchBuffer<u8> local_unswizzle_data_buffer(image.unswizzled_size_bytes); + const size_t 
guest_size_bytes = image.guest_size_bytes; + swizzle_data_buffer.resize_destructive(guest_size_bytes); + gpu_memory->ReadBlockUnsafe(image.gpu_addr, swizzle_data_buffer.data(), guest_size_bytes); + auto copies = UnswizzleImage(*gpu_memory, image.gpu_addr, image.info, swizzle_data_buffer, + local_unswizzle_data_buffer); + const size_t out_size = MapSizeBytes(image); + + auto func = [out_size, copies, info = image.info, + input = std::move(local_unswizzle_data_buffer), + async_decode = decode_ptr]() mutable { + async_decode->decoded_data.resize_destructive(out_size); + std::span copies_span{copies.data(), copies.size()}; + ConvertImage(input, info, async_decode->decoded_data, copies_span); + + // TODO: Do we need this lock? + std::unique_lock lock{async_decode->mutex}; + async_decode->copies = std::move(copies); + async_decode->complete = true; + }; + texture_decode_worker.QueueWork(std::move(func)); +} + +template <class P> +void TextureCache<P>::TickAsyncDecode() { + bool has_uploads{}; + auto i = async_decodes.begin(); + while (i != async_decodes.end()) { + auto* async_decode = i->get(); + std::unique_lock lock{async_decode->mutex}; + if (!async_decode->complete) { + ++i; + continue; + } + Image& image = slot_images[async_decode->image_id]; + auto staging = runtime.UploadStagingBuffer(MapSizeBytes(image)); + std::memcpy(staging.mapped_span.data(), async_decode->decoded_data.data(), + async_decode->decoded_data.size()); + image.UploadMemory(staging, async_decode->copies); + image.flags &= ~ImageFlagBits::IsDecoding; + has_uploads = true; + i = async_decodes.erase(i); + } + if (has_uploads) { + runtime.InsertUploadMemoryBarrier(); + } +} + +template <class P> bool TextureCache<P>::ScaleUp(Image& image) { const bool has_copy = image.HasScaled(); const bool rescaled = image.ScaleUp(); diff --git a/src/video_core/texture_cache/texture_cache_base.h b/src/video_core/texture_cache/texture_cache_base.h index 485eaabaa..013836933 100644 --- 
a/src/video_core/texture_cache/texture_cache_base.h +++ b/src/video_core/texture_cache/texture_cache_base.h @@ -3,6 +3,7 @@ #pragma once +#include <atomic> #include <deque> #include <limits> #include <mutex> @@ -18,6 +19,7 @@ #include "common/lru_cache.h" #include "common/polyfill_ranges.h" #include "common/scratch_buffer.h" +#include "common/thread_worker.h" #include "video_core/compatible_formats.h" #include "video_core/control/channel_state_cache.h" #include "video_core/delayed_destruction_ring.h" @@ -54,6 +56,14 @@ struct ImageViewInOut { ImageViewId id{}; }; +struct AsyncDecodeContext { + ImageId image_id; + Common::ScratchBuffer<u8> decoded_data; + std::vector<BufferImageCopy> copies; + std::mutex mutex; + std::atomic_bool complete; +}; + using TextureCacheGPUMap = std::unordered_map<u64, std::vector<ImageId>, Common::IdentityHash<u64>>; class TextureCacheChannelInfo : public ChannelInfo { @@ -377,6 +387,9 @@ private: bool ScaleDown(Image& image); u64 GetScaledImageSizeBytes(const ImageBase& image); + void QueueAsyncDecode(Image& image, ImageId image_id); + void TickAsyncDecode(); + Runtime& runtime; VideoCore::RasterizerInterface& rasterizer; @@ -430,6 +443,9 @@ private: u64 modification_tick = 0; u64 frame_tick = 0; + + Common::ThreadWorker texture_decode_worker{1, "TextureDecoder"}; + std::vector<std::unique_ptr<AsyncDecodeContext>> async_decodes; }; } // namespace VideoCommon diff --git a/src/video_core/textures/astc.cpp b/src/video_core/textures/astc.cpp index e8d7c7863..4381eed1d 100644 --- a/src/video_core/textures/astc.cpp +++ b/src/video_core/textures/astc.cpp @@ -1656,8 +1656,8 @@ void Decompress(std::span<const uint8_t> data, uint32_t width, uint32_t height, const u32 rows = Common::DivideUp(height, block_height); const u32 cols = Common::DivideUp(width, block_width); - Common::ThreadWorker workers{std::max(std::thread::hardware_concurrency(), 2U) / 2, - "ASTCDecompress"}; + static Common::ThreadWorker 
workers{std::max(std::thread::hardware_concurrency(), 2U) / 2, + "ASTCDecompress"}; for (u32 z = 0; z < depth; ++z) { const u32 depth_offset = z * height * width * 4; diff --git a/src/yuzu/configuration/config.cpp b/src/yuzu/configuration/config.cpp index f8fae7416..4dad83b75 100644 --- a/src/yuzu/configuration/config.cpp +++ b/src/yuzu/configuration/config.cpp @@ -702,6 +702,7 @@ void Config::ReadRendererValues() { ReadGlobalSetting(Settings::values.use_asynchronous_gpu_emulation); ReadGlobalSetting(Settings::values.nvdec_emulation); ReadGlobalSetting(Settings::values.accelerate_astc); + ReadGlobalSetting(Settings::values.async_astc); ReadGlobalSetting(Settings::values.use_vsync); ReadGlobalSetting(Settings::values.shader_backend); ReadGlobalSetting(Settings::values.use_asynchronous_shaders); @@ -1343,6 +1344,7 @@ void Config::SaveRendererValues() { static_cast<u32>(Settings::values.nvdec_emulation.GetDefault()), Settings::values.nvdec_emulation.UsingGlobal()); WriteGlobalSetting(Settings::values.accelerate_astc); + WriteGlobalSetting(Settings::values.async_astc); WriteGlobalSetting(Settings::values.use_vsync); WriteSetting(QString::fromStdString(Settings::values.shader_backend.GetLabel()), static_cast<u32>(Settings::values.shader_backend.GetValue(global)), diff --git a/src/yuzu/configuration/configure_graphics_advanced.cpp b/src/yuzu/configuration/configure_graphics_advanced.cpp index 7ab5d5bf5..59fb1b334 100644 --- a/src/yuzu/configuration/configure_graphics_advanced.cpp +++ b/src/yuzu/configuration/configure_graphics_advanced.cpp @@ -23,11 +23,13 @@ void ConfigureGraphicsAdvanced::SetConfiguration() { const bool runtime_lock = !system.IsPoweredOn(); ui->use_vsync->setEnabled(runtime_lock); ui->renderer_force_max_clock->setEnabled(runtime_lock); + ui->async_astc->setEnabled(runtime_lock); ui->use_asynchronous_shaders->setEnabled(runtime_lock); ui->anisotropic_filtering_combobox->setEnabled(runtime_lock); 
ui->renderer_force_max_clock->setChecked(Settings::values.renderer_force_max_clock.GetValue()); ui->use_vsync->setChecked(Settings::values.use_vsync.GetValue()); + ui->async_astc->setChecked(Settings::values.async_astc.GetValue()); ui->use_asynchronous_shaders->setChecked(Settings::values.use_asynchronous_shaders.GetValue()); ui->use_fast_gpu_time->setChecked(Settings::values.use_fast_gpu_time.GetValue()); ui->use_pessimistic_flushes->setChecked(Settings::values.use_pessimistic_flushes.GetValue()); @@ -58,6 +60,8 @@ void ConfigureGraphicsAdvanced::ApplyConfiguration() { ConfigurationShared::ApplyPerGameSetting(&Settings::values.max_anisotropy, ui->anisotropic_filtering_combobox); ConfigurationShared::ApplyPerGameSetting(&Settings::values.use_vsync, ui->use_vsync, use_vsync); + ConfigurationShared::ApplyPerGameSetting(&Settings::values.async_astc, ui->async_astc, + async_astc); ConfigurationShared::ApplyPerGameSetting(&Settings::values.use_asynchronous_shaders, ui->use_asynchronous_shaders, use_asynchronous_shaders); @@ -89,6 +93,7 @@ void ConfigureGraphicsAdvanced::SetupPerGameUI() { ui->renderer_force_max_clock->setEnabled( Settings::values.renderer_force_max_clock.UsingGlobal()); ui->use_vsync->setEnabled(Settings::values.use_vsync.UsingGlobal()); + ui->async_astc->setEnabled(Settings::values.async_astc.UsingGlobal()); ui->use_asynchronous_shaders->setEnabled( Settings::values.use_asynchronous_shaders.UsingGlobal()); ui->use_fast_gpu_time->setEnabled(Settings::values.use_fast_gpu_time.UsingGlobal()); @@ -106,6 +111,8 @@ void ConfigureGraphicsAdvanced::SetupPerGameUI() { Settings::values.renderer_force_max_clock, renderer_force_max_clock); ConfigurationShared::SetColoredTristate(ui->use_vsync, Settings::values.use_vsync, use_vsync); + ConfigurationShared::SetColoredTristate(ui->async_astc, Settings::values.async_astc, + async_astc); ConfigurationShared::SetColoredTristate(ui->use_asynchronous_shaders, Settings::values.use_asynchronous_shaders, 
use_asynchronous_shaders); diff --git a/src/yuzu/configuration/configure_graphics_advanced.h b/src/yuzu/configuration/configure_graphics_advanced.h index df557d585..bf1b04749 100644 --- a/src/yuzu/configuration/configure_graphics_advanced.h +++ b/src/yuzu/configuration/configure_graphics_advanced.h @@ -38,6 +38,7 @@ private: ConfigurationShared::CheckState renderer_force_max_clock; ConfigurationShared::CheckState use_vsync; + ConfigurationShared::CheckState async_astc; ConfigurationShared::CheckState use_asynchronous_shaders; ConfigurationShared::CheckState use_fast_gpu_time; ConfigurationShared::CheckState use_pessimistic_flushes; diff --git a/src/yuzu/configuration/configure_graphics_advanced.ui b/src/yuzu/configuration/configure_graphics_advanced.ui index 061885e30..a7dbdc18c 100644 --- a/src/yuzu/configuration/configure_graphics_advanced.ui +++ b/src/yuzu/configuration/configure_graphics_advanced.ui @@ -90,6 +90,16 @@ </widget> </item> <item> + <widget class="QCheckBox" name="async_astc"> + <property name="toolTip"> + <string>Enables asynchronous ASTC texture decoding, which may reduce load time stutter. This feature is experimental.</string> + </property> + <property name="text"> + <string>Decode ASTC textures asynchronously (Hack)</string> + </property> + </widget> + </item> + <item> <widget class="QCheckBox" name="use_asynchronous_shaders"> <property name="toolTip"> <string>Enables asynchronous shader compilation, which may reduce shader stutter. 
This feature is experimental.</string> diff --git a/src/yuzu_cmd/config.cpp b/src/yuzu_cmd/config.cpp index 3b6dce296..464da3231 100644 --- a/src/yuzu_cmd/config.cpp +++ b/src/yuzu_cmd/config.cpp @@ -324,6 +324,7 @@ void Config::ReadValues() { ReadSetting("Renderer", Settings::values.use_asynchronous_shaders); ReadSetting("Renderer", Settings::values.nvdec_emulation); ReadSetting("Renderer", Settings::values.accelerate_astc); + ReadSetting("Renderer", Settings::values.async_astc); ReadSetting("Renderer", Settings::values.use_fast_gpu_time); ReadSetting("Renderer", Settings::values.use_pessimistic_flushes); ReadSetting("Renderer", Settings::values.use_vulkan_driver_pipeline_cache); diff --git a/src/yuzu_cmd/default_ini.h b/src/yuzu_cmd/default_ini.h index cf3cc4c4e..20e403400 100644 --- a/src/yuzu_cmd/default_ini.h +++ b/src/yuzu_cmd/default_ini.h @@ -342,6 +342,10 @@ nvdec_emulation = # 0: Off, 1 (default): On accelerate_astc = +# Decode ASTC textures asynchronously. +# 0 (default): Off, 1: On +async_astc = + # Turns on the speed limiter, which will limit the emulation speed to the desired speed limit value # 0: Off, 1: On (default) use_speed_limit = |