From 4f5d8e434278cd5999bf21e91f0923d55ec8d52b Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Tue, 12 Nov 2019 23:26:56 -0300 Subject: gl_shader_cache: Specialize shader workgroup Drop the usage of ARB_compute_variable_group_size and specialize compute shaders instead. This permits compute to run on AMD and Intel proprietary drivers. --- src/video_core/engines/kepler_compute.h | 2 +- src/video_core/renderer_opengl/gl_rasterizer.cpp | 21 +++----- src/video_core/renderer_opengl/gl_shader_cache.cpp | 63 ++++++++++------------ src/video_core/renderer_opengl/gl_shader_cache.h | 6 +-- .../renderer_opengl/gl_shader_disk_cache.cpp | 4 +- .../renderer_opengl/gl_shader_disk_cache.h | 46 +++++++++++----- 6 files changed, 74 insertions(+), 68 deletions(-) (limited to 'src') diff --git a/src/video_core/engines/kepler_compute.h b/src/video_core/engines/kepler_compute.h index 5259d92bd..bd49c6627 100644 --- a/src/video_core/engines/kepler_compute.h +++ b/src/video_core/engines/kepler_compute.h @@ -140,7 +140,7 @@ public: INSERT_PADDING_WORDS(0x3); - BitField<0, 16, u32> shared_alloc; + BitField<0, 18, u32> shared_alloc; BitField<16, 16, u32> block_dim_x; union { diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index b76de71ec..bd4e5f6e3 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -273,8 +273,8 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { SetupDrawGlobalMemory(stage, shader); SetupDrawTextures(stage, shader, base_bindings); - const ProgramVariant variant{base_bindings, primitive_mode}; - const auto [program_handle, next_bindings] = shader->GetProgramHandle(variant); + const ProgramVariant variant(base_bindings, primitive_mode); + const auto [program_handle, next_bindings] = shader->GetHandle(variant); switch (program) { case Maxwell::ShaderProgram::VertexA: @@ -725,18 +725,14 @@ bool RasterizerOpenGL::DrawMultiBatch(bool 
is_indexed) { } void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) { - if (!GLAD_GL_ARB_compute_variable_group_size) { - LOG_ERROR(Render_OpenGL, "Compute is currently not supported on this device due to the " - "lack of GL_ARB_compute_variable_group_size"); - return; - } - auto kernel = shader_cache.GetComputeKernel(code_addr); SetupComputeTextures(kernel); SetupComputeImages(kernel); - const auto [program, next_bindings] = kernel->GetProgramHandle({}); - state.draw.shader_program = program; + const auto& launch_desc = system.GPU().KeplerCompute().launch_description; + const ProgramVariant variant(launch_desc.block_dim_x, launch_desc.block_dim_y, + launch_desc.block_dim_z); + std::tie(state.draw.shader_program, std::ignore) = kernel->GetHandle(variant); state.draw.program_pipeline = 0; const std::size_t buffer_size = @@ -760,10 +756,7 @@ void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) { state.ApplyShaderProgram(); state.ApplyProgramPipeline(); - const auto& launch_desc = system.GPU().KeplerCompute().launch_description; - glDispatchComputeGroupSizeARB(launch_desc.grid_dim_x, launch_desc.grid_dim_y, - launch_desc.grid_dim_z, launch_desc.block_dim_x, - launch_desc.block_dim_y, launch_desc.block_dim_z); + glDispatchCompute(launch_desc.grid_dim_x, launch_desc.grid_dim_y, launch_desc.grid_dim_z); } void RasterizerOpenGL::FlushAll() {} diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp index 7ce06a978..a5789b6d3 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp @@ -255,7 +255,7 @@ void FillLocker(ConstBufferLocker& locker, const ShaderDiskCacheUsage& usage) { CachedProgram BuildShader(const Device& device, u64 unique_identifier, ProgramType program_type, const ProgramCode& program_code, const ProgramCode& program_code_b, - const ProgramVariant& variant, ConstBufferLocker& locker, + ConstBufferLocker& locker, const 
ProgramVariant& variant, bool hint_retrievable = false) { LOG_INFO(Render_OpenGL, "called. {}", GetShaderId(unique_identifier, program_type)); @@ -268,17 +268,11 @@ CachedProgram BuildShader(const Device& device, u64 unique_identifier, ProgramTy } const auto entries = GLShader::GetEntries(ir); - auto base_bindings{variant.base_bindings}; - const auto primitive_mode{variant.primitive_mode}; - std::string source = fmt::format(R"(// {} #version 430 core #extension GL_ARB_separate_shader_objects : enable )", GetShaderId(unique_identifier, program_type)); - if (is_compute) { - source += "#extension GL_ARB_compute_variable_group_size : require\n"; - } if (device.HasShaderBallot()) { source += "#extension GL_ARB_shader_ballot : require\n"; } @@ -295,6 +289,7 @@ CachedProgram BuildShader(const Device& device, u64 unique_identifier, ProgramTy } source += '\n'; + auto base_bindings = variant.base_bindings; if (!is_compute) { source += fmt::format("#define EMULATION_UBO_BINDING {}\n", base_bindings.cbuf++); } @@ -318,13 +313,15 @@ CachedProgram BuildShader(const Device& device, u64 unique_identifier, ProgramTy if (program_type == ProgramType::Geometry) { const auto [glsl_topology, debug_name, max_vertices] = - GetPrimitiveDescription(primitive_mode); + GetPrimitiveDescription(variant.primitive_mode); - source += "layout (" + std::string(glsl_topology) + ") in;\n\n"; - source += "#define MAX_VERTEX_INPUT " + std::to_string(max_vertices) + '\n'; + source += fmt::format("layout ({}) in;\n\n", glsl_topology); + source += fmt::format("#define MAX_VERTEX_INPUT {}\n", max_vertices); } if (program_type == ProgramType::Compute) { - source += "layout (local_size_variable) in;\n"; + source += + fmt::format("layout (local_size_x = {}, local_size_y = {}, local_size_z = {}) in;\n", + variant.block_x, variant.block_y, variant.block_z); } source += '\n'; @@ -422,58 +419,53 @@ Shader CachedShader::CreateFromCache(const ShaderParameters& params, unspecialized.code_b)); } -std::tuple 
CachedShader::GetProgramHandle(const ProgramVariant& variant) { - UpdateVariant(); +std::tuple<GLuint, BaseBindings> CachedShader::GetHandle(const ProgramVariant& variant) { + EnsureValidLockerVariant(); - const auto [entry, is_cache_miss] = curr_variant->programs.try_emplace(variant); + const auto [entry, is_cache_miss] = curr_locker_variant->programs.try_emplace(variant); auto& program = entry->second; if (is_cache_miss) { program = BuildShader(device, unique_identifier, program_type, program_code, program_code_b, - variant, *curr_variant->locker); - disk_cache.SaveUsage(GetUsage(variant, *curr_variant->locker)); + *curr_locker_variant->locker, variant); + disk_cache.SaveUsage(GetUsage(variant, *curr_locker_variant->locker)); LabelGLObject(GL_PROGRAM, program->handle, cpu_addr); } auto base_bindings = variant.base_bindings; base_bindings.cbuf += static_cast<u32>(entries.const_buffers.size()); - if (program_type != ProgramType::Compute) { - base_bindings.cbuf += STAGE_RESERVED_UBOS; - } + base_bindings.cbuf += STAGE_RESERVED_UBOS; base_bindings.gmem += static_cast<u32>(entries.global_memory_entries.size()); base_bindings.sampler += static_cast<u32>(entries.samplers.size()); return {program->handle, base_bindings}; } -void CachedShader::UpdateVariant() { - if (curr_variant && !curr_variant->locker->IsConsistent()) { - curr_variant = nullptr; +bool CachedShader::EnsureValidLockerVariant() { + const auto previous_variant = curr_locker_variant; + if (curr_locker_variant && !curr_locker_variant->locker->IsConsistent()) { + curr_locker_variant = nullptr; } - if (!curr_variant) { + if (!curr_locker_variant) { for (auto& variant : locker_variants) { if (variant->locker->IsConsistent()) { - curr_variant = variant.get(); + curr_locker_variant = variant.get(); } } } - if (!curr_variant) { + if (!curr_locker_variant) { auto& new_variant = locker_variants.emplace_back(); new_variant = std::make_unique<LockerVariant>(); new_variant->locker = MakeLocker(system, program_type); - curr_variant = new_variant.get(); +
curr_locker_variant = new_variant.get(); } + return previous_variant == curr_locker_variant; } ShaderDiskCacheUsage CachedShader::GetUsage(const ProgramVariant& variant, const ConstBufferLocker& locker) const { - ShaderDiskCacheUsage usage; - usage.unique_identifier = unique_identifier; - usage.variant = variant; - usage.keys = locker.GetKeys(); - usage.bound_samplers = locker.GetBoundSamplers(); - usage.bindless_samplers = locker.GetBindlessSamplers(); - return usage; + return ShaderDiskCacheUsage{unique_identifier, variant, locker.GetKeys(), + locker.GetBoundSamplers(), locker.GetBindlessSamplers()}; } ShaderCacheOpenGL::ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& system, @@ -534,9 +526,10 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading, if (!shader) { auto locker{MakeLocker(system, unspecialized.program_type)}; FillLocker(*locker, usage); + shader = BuildShader(device, usage.unique_identifier, unspecialized.program_type, - unspecialized.code, unspecialized.code_b, usage.variant, - *locker, true); + unspecialized.code, unspecialized.code_b, *locker, + usage.variant, true); } std::scoped_lock lock{mutex}; diff --git a/src/video_core/renderer_opengl/gl_shader_cache.h b/src/video_core/renderer_opengl/gl_shader_cache.h index 6bd7c9cf1..795b05a19 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.h +++ b/src/video_core/renderer_opengl/gl_shader_cache.h @@ -86,7 +86,7 @@ public: } /// Gets the GL program handle for the shader - std::tuple GetProgramHandle(const ProgramVariant& variant); + std::tuple GetHandle(const ProgramVariant& variant); private: struct LockerVariant { @@ -98,7 +98,7 @@ private: GLShader::ShaderEntries entries, ProgramCode program_code, ProgramCode program_code_b); - void UpdateVariant(); + bool EnsureValidLockerVariant(); ShaderDiskCacheUsage GetUsage(const ProgramVariant& variant, const VideoCommon::Shader::ConstBufferLocker& locker) const; @@ -117,7 +117,7 @@ private: ProgramCode 
program_code; ProgramCode program_code_b; - LockerVariant* curr_variant = nullptr; + LockerVariant* curr_locker_variant = nullptr; std::vector<std::unique_ptr<LockerVariant>> locker_variants; }; diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp index 3f4daf28d..9156f180a 100644 --- a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp @@ -52,11 +52,11 @@ struct BindlessSamplerKey { Tegra::Engines::SamplerDescriptor sampler{}; }; -constexpr u32 NativeVersion = 6; +constexpr u32 NativeVersion = 7; // Making sure sizes doesn't change by accident static_assert(sizeof(BaseBindings) == 16); -static_assert(sizeof(ProgramVariant) == 20); +static_assert(sizeof(ProgramVariant) == 28); ShaderCacheVersionHash GetShaderCacheVersionHash() { ShaderCacheVersionHash hash{}; diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.h b/src/video_core/renderer_opengl/gl_shader_disk_cache.h index 55311dc6d..4c7ca004d 100644 --- a/src/video_core/renderer_opengl/gl_shader_disk_cache.h +++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.h @@ -44,32 +44,49 @@ struct BaseBindings { u32 sampler{}; u32 image{}; - bool operator==(const BaseBindings& rhs) const { + bool operator==(const BaseBindings& rhs) const noexcept { return std::tie(cbuf, gmem, sampler, image) == std::tie(rhs.cbuf, rhs.gmem, rhs.sampler, rhs.image); } - bool operator!=(const BaseBindings& rhs) const { + bool operator!=(const BaseBindings& rhs) const noexcept { return !operator==(rhs); } }; static_assert(std::is_trivially_copyable_v<BaseBindings>); -/// Describes the different variants a single program can be compiled. -struct ProgramVariant { - BaseBindings base_bindings; +/// Describes the different variants a program can be compiled with. +struct ProgramVariant final { + ProgramVariant() = default; + + /// Graphics constructor.
+ explicit constexpr ProgramVariant(BaseBindings base_bindings, GLenum primitive_mode) noexcept + : base_bindings{base_bindings}, primitive_mode{primitive_mode} {} + + /// Compute constructor. + explicit constexpr ProgramVariant(u32 block_x, u32 block_y, u32 block_z) noexcept + : block_x{block_x}, block_y{static_cast<u16>(block_y)}, block_z{static_cast<u16>(block_z)} { + } + + // Graphics specific parameters. + BaseBindings base_bindings{}; GLenum primitive_mode{}; - bool operator==(const ProgramVariant& rhs) const { - return std::tie(base_bindings, primitive_mode) == - std::tie(rhs.base_bindings, rhs.primitive_mode); + // Compute specific parameters. + u32 block_x{}; + u16 block_y{}; + u16 block_z{}; + + bool operator==(const ProgramVariant& rhs) const noexcept { + return std::tie(base_bindings, primitive_mode, block_x, block_y, block_z) == + std::tie(rhs.base_bindings, rhs.primitive_mode, rhs.block_x, rhs.block_y, + rhs.block_z); } - bool operator!=(const ProgramVariant& rhs) const { + bool operator!=(const ProgramVariant& rhs) const noexcept { return !operator==(rhs); } }; - static_assert(std::is_trivially_copyable_v<ProgramVariant>); /// Describes how a shader is used. @@ -108,8 +125,11 @@ struct hash<OpenGL::BaseBindings> { template <> struct hash<OpenGL::ProgramVariant> { std::size_t operator()(const OpenGL::ProgramVariant& variant) const noexcept { - return std::hash<OpenGL::BaseBindings>()(variant.base_bindings) ^ - (static_cast<std::size_t>(variant.primitive_mode) << 6); + return std::hash<OpenGL::BaseBindings>{}(variant.base_bindings) ^ + (static_cast<std::size_t>(variant.primitive_mode) << 6) ^ + static_cast<std::size_t>(variant.block_x) ^ + (static_cast<std::size_t>(variant.block_y) << 32) ^ + (static_cast<std::size_t>(variant.block_z) << 48); } }; @@ -117,7 +137,7 @@ template <> struct hash<OpenGL::ShaderDiskCacheUsage> { std::size_t operator()(const OpenGL::ShaderDiskCacheUsage& usage) const noexcept { return static_cast<std::size_t>(usage.unique_identifier) ^ - std::hash<OpenGL::ProgramVariant>()(usage.variant); + std::hash<OpenGL::ProgramVariant>{}(usage.variant); } }; -- cgit v1.2.3