From 0526bf18952bc6c6877dcdc05731d34327396662 Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Mon, 26 Aug 2019 22:09:12 -0300 Subject: shader_ir/warp: Implement SHFL --- src/video_core/engines/shader_bytecode.h | 18 +++++++ src/video_core/renderer_opengl/gl_shader_cache.cpp | 3 +- .../renderer_opengl/gl_shader_decompiler.cpp | 63 +++++++++++++++++++--- .../renderer_vulkan/vk_shader_decompiler.cpp | 50 +++++++++++++++++ src/video_core/shader/decode/warp.cpp | 47 ++++++++++++++++ src/video_core/shader/node.h | 10 ++++ 6 files changed, 182 insertions(+), 9 deletions(-) diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h index 052e6d24e..a6110bd86 100644 --- a/src/video_core/engines/shader_bytecode.h +++ b/src/video_core/engines/shader_bytecode.h @@ -566,6 +566,13 @@ enum class ImageAtomicOperation : u64 { Exch = 8, }; +enum class ShuffleOperation : u64 { + Idx = 0, // shuffleNV + Up = 1, // shuffleUpNV + Down = 2, // shuffleDownNV + Bfly = 3, // shuffleXorNV +}; + union Instruction { Instruction& operator=(const Instruction& instr) { value = instr.value; @@ -599,6 +606,15 @@ union Instruction { BitField<42, 1, u64> negate_value; } vote; + union { + BitField<30, 2, ShuffleOperation> operation; + BitField<48, 3, u64> pred48; + BitField<28, 1, u64> is_index_imm; + BitField<29, 1, u64> is_mask_imm; + BitField<20, 5, u64> index_imm; + BitField<34, 13, u64> mask_imm; + } shfl; + union { BitField<8, 8, Register> gpr; BitField<20, 24, s64> offset; @@ -1542,6 +1558,7 @@ public: BRK, DEPBAR, VOTE, + SHFL, BFE_C, BFE_R, BFE_IMM, @@ -1833,6 +1850,7 @@ private: INST("111000110000----", Id::EXIT, Type::Flow, "EXIT"), INST("1111000011110---", Id::DEPBAR, Type::Synch, "DEPBAR"), INST("0101000011011---", Id::VOTE, Type::Warp, "VOTE"), + INST("1110111100010---", Id::SHFL, Type::Warp, "SHFL"), INST("1110111111011---", Id::LD_A, Type::Memory, "LD_A"), INST("1110111101001---", Id::LD_S, Type::Memory, "LD_S"), INST("1110111101000---", 
Id::LD_L, Type::Memory, "LD_L"), diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp index 909ccb82c..0dbc4c02f 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp @@ -214,7 +214,8 @@ CachedProgram SpecializeShader(const std::string& code, const GLShader::ShaderEn std::string source = "#version 430 core\n" "#extension GL_ARB_separate_shader_objects : enable\n" "#extension GL_NV_gpu_shader5 : enable\n" - "#extension GL_NV_shader_thread_group : enable\n"; + "#extension GL_NV_shader_thread_group : enable\n" + "#extension GL_NV_shader_thread_shuffle : enable\n"; if (entries.shader_viewport_layer_array) { source += "#extension GL_ARB_shader_viewport_layer_array : enable\n"; } diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp index 137b23740..6b31ba0f2 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp @@ -1934,8 +1934,7 @@ private: Expression BallotThread(Operation operation) { const std::string value = VisitOperand(operation, 0).AsBool(); if (!device.HasWarpIntrinsics()) { - LOG_ERROR(Render_OpenGL, - "Nvidia warp intrinsics are not available and its required by a shader"); + LOG_ERROR(Render_OpenGL, "Nvidia vote intrinsics are required by this shader"); // Stub on non-Nvidia devices by simulating all threads voting the same as the active // one. return {fmt::format("({} ? 
0xFFFFFFFFU : 0U)", value), Type::Uint}; @@ -1946,8 +1945,7 @@ private: Expression Vote(Operation operation, const char* func) { const std::string value = VisitOperand(operation, 0).AsBool(); if (!device.HasWarpIntrinsics()) { - LOG_ERROR(Render_OpenGL, - "Nvidia vote intrinsics are not available and its required by a shader"); + LOG_ERROR(Render_OpenGL, "Nvidia vote intrinsics are required by this shader"); // Stub with a warp size of one. return {value, Type::Bool}; } @@ -1964,15 +1962,54 @@ private: Expression VoteEqual(Operation operation) { if (!device.HasWarpIntrinsics()) { - LOG_ERROR(Render_OpenGL, - "Nvidia vote intrinsics are not available and its required by a shader"); - // We must return true here since a stub for a theoretical warp size of 1 will always - // return an equal result for all its votes. + LOG_ERROR(Render_OpenGL, "Nvidia vote intrinsics are required by this shader"); + // We must return true here since a stub for a theoretical warp size of 1 + // will always return an equal result across all votes. return {"true", Type::Bool}; } return Vote(operation, "allThreadsEqualNV"); } + template <const std::string_view& func> + Expression Shuffle(Operation operation) { + const std::string value = VisitOperand(operation, 0).AsFloat(); + if (!device.HasWarpIntrinsics()) { + LOG_ERROR(Render_OpenGL, "Nvidia shuffle intrinsics are required by this shader"); + // On a "single-thread" device we are either on the same thread or out of bounds. Both + // cases return the passed value. 
+ return {value, Type::Float}; + } + + const std::string index = VisitOperand(operation, 1).AsUint(); + const std::string width = VisitOperand(operation, 2).AsUint(); + return {fmt::format("{}({}, {}, {})", func, value, index, width), Type::Float}; + } + + template <const std::string_view& func> + Expression InRangeShuffle(Operation operation) { + const std::string index = VisitOperand(operation, 0).AsUint(); + const std::string width = VisitOperand(operation, 1).AsUint(); + if (!device.HasWarpIntrinsics()) { + // On a "single-thread" device we are only in bounds when the requested index is 0. + return {fmt::format("({} == 0U)", index), Type::Bool}; + } + + const std::string in_range = code.GenerateTemporary(); + code.AddLine("bool {};", in_range); + code.AddLine("{}(0U, {}, {}, {});", func, index, width, in_range); + return {in_range, Type::Bool}; + } + + struct Func final { + Func() = delete; + ~Func() = delete; + + static constexpr std::string_view ShuffleIndexed = "shuffleNV"; + static constexpr std::string_view ShuffleUp = "shuffleUpNV"; + static constexpr std::string_view ShuffleDown = "shuffleDownNV"; + static constexpr std::string_view ShuffleButterfly = "shuffleXorNV"; + }; + static constexpr std::array operation_decompilers = { &GLSLDecompiler::Assign, @@ -2135,6 +2172,16 @@ private: &GLSLDecompiler::VoteAll, &GLSLDecompiler::VoteAny, &GLSLDecompiler::VoteEqual, + + &GLSLDecompiler::Shuffle<Func::ShuffleIndexed>, + &GLSLDecompiler::Shuffle<Func::ShuffleUp>, + &GLSLDecompiler::Shuffle<Func::ShuffleDown>, + &GLSLDecompiler::Shuffle<Func::ShuffleButterfly>, + + &GLSLDecompiler::InRangeShuffle<Func::ShuffleIndexed>, + &GLSLDecompiler::InRangeShuffle<Func::ShuffleUp>, + &GLSLDecompiler::InRangeShuffle<Func::ShuffleDown>, + &GLSLDecompiler::InRangeShuffle<Func::ShuffleButterfly>, }; static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount)); diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp index b9153934e..f7fbbb6e4 100644 --- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp +++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp @@ 
-1127,6 +1127,46 @@ private: return {}; } + Id ShuffleIndexed(Operation) { + UNIMPLEMENTED(); + return {}; + } + + Id ShuffleUp(Operation) { + UNIMPLEMENTED(); + return {}; + } + + Id ShuffleDown(Operation) { + UNIMPLEMENTED(); + return {}; + } + + Id ShuffleButterfly(Operation) { + UNIMPLEMENTED(); + return {}; + } + + Id InRangeShuffleIndexed(Operation) { + UNIMPLEMENTED(); + return {}; + } + + Id InRangeShuffleUp(Operation) { + UNIMPLEMENTED(); + return {}; + } + + Id InRangeShuffleDown(Operation) { + UNIMPLEMENTED(); + return {}; + } + + Id InRangeShuffleButterfly(Operation) { + UNIMPLEMENTED(); + return {}; + } + Id DeclareBuiltIn(spv::BuiltIn builtin, spv::StorageClass storage, Id type, const std::string& name) { const Id id = OpVariable(type, storage); @@ -1431,6 +1471,16 @@ private: &SPIRVDecompiler::VoteAll, &SPIRVDecompiler::VoteAny, &SPIRVDecompiler::VoteEqual, + + &SPIRVDecompiler::ShuffleIndexed, + &SPIRVDecompiler::ShuffleUp, + &SPIRVDecompiler::ShuffleDown, + &SPIRVDecompiler::ShuffleButterfly, + + &SPIRVDecompiler::InRangeShuffleIndexed, + &SPIRVDecompiler::InRangeShuffleUp, + &SPIRVDecompiler::InRangeShuffleDown, + &SPIRVDecompiler::InRangeShuffleButterfly, }; static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount)); diff --git a/src/video_core/shader/decode/warp.cpp b/src/video_core/shader/decode/warp.cpp index 04ca74f46..a8e481b3c 100644 --- a/src/video_core/shader/decode/warp.cpp +++ b/src/video_core/shader/decode/warp.cpp @@ -13,6 +13,7 @@ namespace VideoCommon::Shader { using Tegra::Shader::Instruction; using Tegra::Shader::OpCode; using Tegra::Shader::Pred; +using Tegra::Shader::ShuffleOperation; using Tegra::Shader::VoteOperation; namespace { @@ -44,6 +45,52 @@ u32 ShaderIR::DecodeWarp(NodeBlock& bb, u32 pc) { SetPredicate(bb, instr.vote.dest_pred, vote); break; } + case OpCode::Id::SHFL: { + Node mask = instr.shfl.is_mask_imm ? 
Immediate(static_cast<u32>(instr.shfl.mask_imm)) + : GetRegister(instr.gpr39); + Node width = [&] { + // Convert the obscure SHFL mask back into GL_NV_shader_thread_shuffle's width. This has + // been done reversing Nvidia's math. It won't work on all cases due to SHFL having + // different parameters that don't properly map to GLSL's interface, but it should work + // for cases emitted by Nvidia's compiler. + if (instr.shfl.operation == ShuffleOperation::Up) { + return Operation( + OperationCode::ILogicalShiftRight, + Operation(OperationCode::IAdd, std::move(mask), Immediate(-0x2000)), + Immediate(8)); + } else { + return Operation(OperationCode::ILogicalShiftRight, + Operation(OperationCode::IAdd, Immediate(0x201F), + Operation(OperationCode::INegate, std::move(mask))), + Immediate(8)); + } + }(); + + const auto [operation, in_range] = [instr]() -> std::pair<OperationCode, OperationCode> { + switch (instr.shfl.operation) { + case ShuffleOperation::Idx: + return {OperationCode::ShuffleIndexed, OperationCode::InRangeShuffleIndexed}; + case ShuffleOperation::Up: + return {OperationCode::ShuffleUp, OperationCode::InRangeShuffleUp}; + case ShuffleOperation::Down: + return {OperationCode::ShuffleDown, OperationCode::InRangeShuffleDown}; + case ShuffleOperation::Bfly: + return {OperationCode::ShuffleButterfly, OperationCode::InRangeShuffleButterfly}; + } + UNREACHABLE_MSG("Invalid SHFL operation: {}", + static_cast<u32>(instr.shfl.operation.Value())); + return {}; + }(); + + // Setting the predicate before the register is intentional to avoid overwriting. + Node index = instr.shfl.is_index_imm ? 
Immediate(static_cast<u32>(instr.shfl.index_imm)) + : GetRegister(instr.gpr20); + SetPredicate(bb, instr.shfl.pred48, Operation(in_range, index, width)); + SetRegister( + bb, instr.gpr0, + Operation(operation, GetRegister(instr.gpr8), std::move(index), std::move(width))); + break; + } default: UNIMPLEMENTED_MSG("Unhandled warp instruction: {}", opcode->get().GetName()); break; diff --git a/src/video_core/shader/node.h b/src/video_core/shader/node.h index b47b201cf..86de7e0a3 100644 --- a/src/video_core/shader/node.h +++ b/src/video_core/shader/node.h @@ -181,6 +181,16 @@ enum class OperationCode { VoteAny, /// (bool) -> bool VoteEqual, /// (bool) -> bool + ShuffleIndexed, /// (uint value, uint index, uint width) -> uint + ShuffleUp, /// (uint value, uint index, uint width) -> uint + ShuffleDown, /// (uint value, uint index, uint width) -> uint + ShuffleButterfly, /// (uint value, uint index, uint width) -> uint + + InRangeShuffleIndexed, /// (uint index, uint width) -> bool + InRangeShuffleUp, /// (uint index, uint width) -> bool + InRangeShuffleDown, /// (uint index, uint width) -> bool + InRangeShuffleButterfly, /// (uint index, uint width) -> bool + Amount, }; -- cgit v1.2.3