diff options
Diffstat (limited to 'src/video_core/shader/decode')
-rw-r--r-- | src/video_core/shader/decode/arithmetic.cpp | 11 | ||||
-rw-r--r-- | src/video_core/shader/decode/arithmetic_half.cpp | 4 | ||||
-rw-r--r-- | src/video_core/shader/decode/arithmetic_half_immediate.cpp | 8 | ||||
-rw-r--r-- | src/video_core/shader/decode/arithmetic_integer.cpp | 19 | ||||
-rw-r--r-- | src/video_core/shader/decode/conversion.cpp | 15 | ||||
-rw-r--r-- | src/video_core/shader/decode/ffma.cpp | 4 | ||||
-rw-r--r-- | src/video_core/shader/decode/half_set.cpp | 4 | ||||
-rw-r--r-- | src/video_core/shader/decode/half_set_predicate.cpp | 4 | ||||
-rw-r--r-- | src/video_core/shader/decode/memory.cpp | 72 | ||||
-rw-r--r-- | src/video_core/shader/decode/other.cpp | 10 | ||||
-rw-r--r-- | src/video_core/shader/decode/register_set_predicate.cpp | 60 | ||||
-rw-r--r-- | src/video_core/shader/decode/texture.cpp | 365 | ||||
-rw-r--r-- | src/video_core/shader/decode/warp.cpp | 82 |
13 files changed, 436 insertions, 222 deletions
diff --git a/src/video_core/shader/decode/arithmetic.cpp b/src/video_core/shader/decode/arithmetic.cpp index 1473c282a..fcedd2af6 100644 --- a/src/video_core/shader/decode/arithmetic.cpp +++ b/src/video_core/shader/decode/arithmetic.cpp @@ -43,12 +43,12 @@ u32 ShaderIR::DecodeArithmetic(NodeBlock& bb, u32 pc) { case OpCode::Id::FMUL_IMM: { // FMUL does not have 'abs' bits and only the second operand has a 'neg' bit. if (instr.fmul.tab5cb8_2 != 0) { - LOG_WARNING(HW_GPU, "FMUL tab5cb8_2({}) is not implemented", - instr.fmul.tab5cb8_2.Value()); + LOG_DEBUG(HW_GPU, "FMUL tab5cb8_2({}) is not implemented", + instr.fmul.tab5cb8_2.Value()); } if (instr.fmul.tab5c68_0 != 1) { - LOG_WARNING(HW_GPU, "FMUL tab5cb8_0({}) is not implemented", - instr.fmul.tab5c68_0.Value()); + LOG_DEBUG(HW_GPU, "FMUL tab5cb8_0({}) is not implemented", + instr.fmul.tab5c68_0.Value()); } op_b = GetOperandAbsNegFloat(op_b, false, instr.fmul.negate_b); @@ -144,10 +144,11 @@ u32 ShaderIR::DecodeArithmetic(NodeBlock& bb, u32 pc) { case OpCode::Id::RRO_C: case OpCode::Id::RRO_R: case OpCode::Id::RRO_IMM: { + LOG_DEBUG(HW_GPU, "(STUBBED) RRO used"); + // Currently RRO is only implemented as a register move. op_b = GetOperandAbsNegFloat(op_b, instr.alu.abs_b, instr.alu.negate_b); SetRegister(bb, instr.gpr0, op_b); - LOG_WARNING(HW_GPU, "RRO instruction is incomplete"); break; } default: diff --git a/src/video_core/shader/decode/arithmetic_half.cpp b/src/video_core/shader/decode/arithmetic_half.cpp index b06cbe441..ee7d9a29d 100644 --- a/src/video_core/shader/decode/arithmetic_half.cpp +++ b/src/video_core/shader/decode/arithmetic_half.cpp @@ -21,8 +21,8 @@ u32 ShaderIR::DecodeArithmeticHalf(NodeBlock& bb, u32 pc) { if (opcode->get().GetId() == OpCode::Id::HADD2_C || opcode->get().GetId() == OpCode::Id::HADD2_R) { - if (instr.alu_half.ftz != 0) { - LOG_WARNING(HW_GPU, "{} FTZ not implemented", opcode->get().GetName()); + if (instr.alu_half.ftz == 0) { + LOG_DEBUG(HW_GPU, "{} without FTZ is not implemented", opcode->get().GetName()); } } diff --git a/src/video_core/shader/decode/arithmetic_half_immediate.cpp b/src/video_core/shader/decode/arithmetic_half_immediate.cpp index 6466fc011..d179b9873 100644 --- a/src/video_core/shader/decode/arithmetic_half_immediate.cpp +++ b/src/video_core/shader/decode/arithmetic_half_immediate.cpp @@ -19,12 +19,12 @@ u32 ShaderIR::DecodeArithmeticHalfImmediate(NodeBlock& bb, u32 pc) { const auto opcode = OpCode::Decode(instr); if (opcode->get().GetId() == OpCode::Id::HADD2_IMM) { - if (instr.alu_half_imm.ftz != 0) { - LOG_WARNING(HW_GPU, "{} FTZ not implemented", opcode->get().GetName()); + if (instr.alu_half_imm.ftz == 0) { + LOG_DEBUG(HW_GPU, "{} without FTZ is not implemented", opcode->get().GetName()); } } else { - if (instr.alu_half_imm.precision != Tegra::Shader::HalfPrecision::None) { - LOG_WARNING(HW_GPU, "{} FTZ not implemented", opcode->get().GetName()); + if (instr.alu_half_imm.precision != Tegra::Shader::HalfPrecision::FTZ) { + LOG_DEBUG(HW_GPU, "{} without FTZ is not implemented", opcode->get().GetName()); } } diff --git a/src/video_core/shader/decode/arithmetic_integer.cpp b/src/video_core/shader/decode/arithmetic_integer.cpp index a33d242e9..371fae127 100644 --- a/src/video_core/shader/decode/arithmetic_integer.cpp +++ b/src/video_core/shader/decode/arithmetic_integer.cpp @@ -130,6 +130,25 @@ u32 ShaderIR::DecodeArithmeticInteger(NodeBlock& bb, u32 pc) { SetRegister(bb, instr.gpr0, value); break; } + case OpCode::Id::FLO_R: + case OpCode::Id::FLO_C: + case OpCode::Id::FLO_IMM: { + Node value; + if (instr.flo.invert) { + op_b = Operation(OperationCode::IBitwiseNot, NO_PRECISE, std::move(op_b)); + } + if (instr.flo.is_signed) { + value = Operation(OperationCode::IBitMSB, NO_PRECISE, std::move(op_b)); + } else { + value = Operation(OperationCode::UBitMSB, NO_PRECISE, std::move(op_b)); + } + if (instr.flo.sh) { + value = + Operation(OperationCode::UBitwiseXor, NO_PRECISE, std::move(value), Immediate(31)); + } + SetRegister(bb, instr.gpr0, std::move(value)); + break; + } case OpCode::Id::SEL_C: case OpCode::Id::SEL_R: case OpCode::Id::SEL_IMM: { diff --git a/src/video_core/shader/decode/conversion.cpp b/src/video_core/shader/decode/conversion.cpp index 32facd6ba..0eeb75559 100644 --- a/src/video_core/shader/decode/conversion.cpp +++ b/src/video_core/shader/decode/conversion.cpp @@ -63,12 +63,11 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) { case OpCode::Id::I2F_R: case OpCode::Id::I2F_C: case OpCode::Id::I2F_IMM: { - UNIMPLEMENTED_IF(instr.conversion.int_src.selector != 0); UNIMPLEMENTED_IF(instr.conversion.dst_size == Register::Size::Long); UNIMPLEMENTED_IF_MSG(instr.generates_cc, "Condition codes generation in I2F is not implemented"); - Node value = [&]() { + Node value = [&] { switch (opcode->get().GetId()) { case OpCode::Id::I2F_R: return GetRegister(instr.gpr20); @@ -81,7 +80,19 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) { return Immediate(0); } }(); + const bool input_signed = instr.conversion.is_input_signed; + + if (instr.conversion.src_size == Register::Size::Byte) { + const u32 offset = static_cast<u32>(instr.conversion.int_src.selector) * 8; + if (offset > 0) { + value = SignedOperation(OperationCode::ILogicalShiftRight, input_signed, + std::move(value), Immediate(offset)); + } + } else { + UNIMPLEMENTED_IF(instr.conversion.int_src.selector != 0); + } + value = ConvertIntegerSize(value, instr.conversion.src_size, input_signed); value = GetOperandAbsNegInteger(value, instr.conversion.abs_a, false, input_signed); value = SignedOperation(OperationCode::FCastInteger, input_signed, PRECISE, value); diff --git a/src/video_core/shader/decode/ffma.cpp b/src/video_core/shader/decode/ffma.cpp index ca2f39e8d..5973588d6 100644 --- a/src/video_core/shader/decode/ffma.cpp +++ b/src/video_core/shader/decode/ffma.cpp @@ -19,10 +19,10 @@ u32 ShaderIR::DecodeFfma(NodeBlock& bb, u32 pc) { UNIMPLEMENTED_IF_MSG(instr.ffma.cc != 0, "FFMA cc not implemented"); if (instr.ffma.tab5980_0 != 1) { - LOG_WARNING(HW_GPU, "FFMA tab5980_0({}) not implemented", instr.ffma.tab5980_0.Value()); + LOG_DEBUG(HW_GPU, "FFMA tab5980_0({}) not implemented", instr.ffma.tab5980_0.Value()); } if (instr.ffma.tab5980_1 != 0) { - LOG_WARNING(HW_GPU, "FFMA tab5980_1({}) not implemented", instr.ffma.tab5980_1.Value()); + LOG_DEBUG(HW_GPU, "FFMA tab5980_1({}) not implemented", instr.ffma.tab5980_1.Value()); } const Node op_a = GetRegister(instr.gpr8); diff --git a/src/video_core/shader/decode/half_set.cpp b/src/video_core/shader/decode/half_set.cpp index 48ca7a4af..848e46874 100644 --- a/src/video_core/shader/decode/half_set.cpp +++ b/src/video_core/shader/decode/half_set.cpp @@ -20,8 +20,8 @@ u32 ShaderIR::DecodeHalfSet(NodeBlock& bb, u32 pc) { const Instruction instr = {program_code[pc]}; const auto opcode = OpCode::Decode(instr); - if (instr.hset2.ftz != 0) { - LOG_WARNING(HW_GPU, "{} FTZ not implemented", opcode->get().GetName()); + if (instr.hset2.ftz == 0) { + LOG_DEBUG(HW_GPU, "{} without FTZ is not implemented", opcode->get().GetName()); } Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.hset2.type_a); diff --git a/src/video_core/shader/decode/half_set_predicate.cpp b/src/video_core/shader/decode/half_set_predicate.cpp index fec8f2dbe..310655619 100644 --- a/src/video_core/shader/decode/half_set_predicate.cpp +++ b/src/video_core/shader/decode/half_set_predicate.cpp @@ -19,7 +19,9 @@ u32 ShaderIR::DecodeHalfSetPredicate(NodeBlock& bb, u32 pc) { const Instruction instr = {program_code[pc]}; const auto opcode = OpCode::Decode(instr); - LOG_DEBUG(HW_GPU, "ftz={}", static_cast<u32>(instr.hsetp2.ftz)); + if (instr.hsetp2.ftz != 0) { + LOG_DEBUG(HW_GPU, "{} without FTZ is not implemented", opcode->get().GetName()); + } Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.hsetp2.type_a); op_a = GetOperandAbsNegHalf(op_a, instr.hsetp2.abs_a, instr.hsetp2.negate_a); diff --git a/src/video_core/shader/decode/memory.cpp b/src/video_core/shader/decode/memory.cpp index 335d78146..c934d0719 100644 --- a/src/video_core/shader/decode/memory.cpp +++ b/src/video_core/shader/decode/memory.cpp @@ -21,8 +21,10 @@ using Tegra::Shader::OpCode; using Tegra::Shader::Register; namespace { -u32 GetUniformTypeElementsCount(Tegra::Shader::UniformType uniform_type) { + +u32 GetLdgMemorySize(Tegra::Shader::UniformType uniform_type) { switch (uniform_type) { + case Tegra::Shader::UniformType::UnsignedByte: case Tegra::Shader::UniformType::Single: return 1; case Tegra::Shader::UniformType::Double: @@ -35,6 +37,22 @@ u32 GetUniformTypeElementsCount(Tegra::Shader::UniformType uniform_type) { return 1; } } + +u32 GetStgMemorySize(Tegra::Shader::UniformType uniform_type) { + switch (uniform_type) { + case Tegra::Shader::UniformType::Single: + return 1; + case Tegra::Shader::UniformType::Double: + return 2; + case Tegra::Shader::UniformType::Quad: + case Tegra::Shader::UniformType::UnsignedQuad: + return 4; + default: + UNIMPLEMENTED_MSG("Unimplemented size={}!", static_cast<u32>(uniform_type)); + return 1; + } +} + } // Anonymous namespace u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) { @@ -168,7 +186,7 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) { const auto [real_address_base, base_address, descriptor] = TrackGlobalMemory(bb, instr, false); - const u32 count = GetUniformTypeElementsCount(type); + const u32 count = GetLdgMemorySize(type); if (!real_address_base || !base_address) { // Tracking failed, load zeroes. for (u32 i = 0; i < count; ++i) { @@ -179,12 +197,22 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) { for (u32 i = 0; i < count; ++i) { const Node it_offset = Immediate(i * 4); - const Node real_address = - Operation(OperationCode::UAdd, NO_PRECISE, real_address_base, it_offset); - const Node gmem = MakeNode<GmemNode>(real_address, base_address, descriptor); + const Node real_address = Operation(OperationCode::UAdd, real_address_base, it_offset); + Node gmem = MakeNode<GmemNode>(real_address, base_address, descriptor); + + if (type == Tegra::Shader::UniformType::UnsignedByte) { + // To handle unaligned loads get the byte used to dereferenced global memory + // and extract that byte from the loaded uint32. + Node byte = Operation(OperationCode::UBitwiseAnd, real_address, Immediate(3)); + byte = Operation(OperationCode::ULogicalShiftLeft, std::move(byte), Immediate(3)); + + gmem = Operation(OperationCode::UBitfieldExtract, std::move(gmem), std::move(byte), + Immediate(8)); + } SetTemporary(bb, i, gmem); } + for (u32 i = 0; i < count; ++i) { SetRegister(bb, instr.gpr0.Value() + i, GetTemporary(i)); } @@ -196,28 +224,28 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) { UNIMPLEMENTED_IF_MSG((instr.attribute.fmt20.immediate.Value() % sizeof(u32)) != 0, "Unaligned attribute loads are not supported"); - u64 next_element = instr.attribute.fmt20.element; - auto next_index = static_cast<u64>(instr.attribute.fmt20.index.Value()); + u64 element = instr.attribute.fmt20.element; + auto index = static_cast<u64>(instr.attribute.fmt20.index.Value()); - const auto StoreNextElement = [&](u32 reg_offset) { - const auto dest = GetOutputAttribute(static_cast<Attribute::Index>(next_index), - next_element, GetRegister(instr.gpr39)); + const u32 num_words = static_cast<u32>(instr.attribute.fmt20.size.Value()) + 1; + for (u32 reg_offset = 0; reg_offset < num_words; ++reg_offset) { + Node dest; + if (instr.attribute.fmt20.patch) { + const u32 offset = static_cast<u32>(index) * 4 + static_cast<u32>(element); + dest = MakeNode<PatchNode>(offset); + } else { + dest = GetOutputAttribute(static_cast<Attribute::Index>(index), element, + GetRegister(instr.gpr39)); + } const auto src = GetRegister(instr.gpr0.Value() + reg_offset); bb.push_back(Operation(OperationCode::Assign, dest, src)); - // Load the next attribute element into the following register. If the element - // to load goes beyond the vec4 size, load the first element of the next - // attribute. - next_element = (next_element + 1) % 4; - next_index = next_index + (next_element == 0 ? 1 : 0); - }; - - const u32 num_words = static_cast<u32>(instr.attribute.fmt20.size.Value()) + 1; - for (u32 reg_offset = 0; reg_offset < num_words; ++reg_offset) { - StoreNextElement(reg_offset); + // Load the next attribute element into the following register. If the element to load + // goes beyond the vec4 size, load the first element of the next attribute. + element = (element + 1) % 4; + index = index + (element == 0 ? 1 : 0); } - break; } case OpCode::Id::ST_L: @@ -274,7 +302,7 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) { break; } - const u32 count = GetUniformTypeElementsCount(type); + const u32 count = GetStgMemorySize(type); for (u32 i = 0; i < count; ++i) { const Node it_offset = Immediate(i * 4); const Node real_address = Operation(OperationCode::UAdd, real_address_base, it_offset); diff --git a/src/video_core/shader/decode/other.cpp b/src/video_core/shader/decode/other.cpp index 116b95f76..7321698b2 100644 --- a/src/video_core/shader/decode/other.cpp +++ b/src/video_core/shader/decode/other.cpp @@ -69,6 +69,8 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) { case OpCode::Id::MOV_SYS: { const Node value = [this, instr] { switch (instr.sys20) { + case SystemVariable::InvocationId: + return Operation(OperationCode::InvocationId); case SystemVariable::Ydirection: return Operation(OperationCode::YNegate); case SystemVariable::InvocationInfo: @@ -255,8 +257,14 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) { SetRegister(bb, instr.gpr0, GetRegister(instr.gpr8)); break; } + case OpCode::Id::MEMBAR: { + UNIMPLEMENTED_IF(instr.membar.type != Tegra::Shader::MembarType::GL); + UNIMPLEMENTED_IF(instr.membar.unknown != Tegra::Shader::MembarUnknown::Default); + bb.push_back(Operation(OperationCode::MemoryBarrierGL)); + break; + } case OpCode::Id::DEPBAR: { - LOG_WARNING(HW_GPU, "DEPBAR instruction is stubbed"); + LOG_DEBUG(HW_GPU, "DEPBAR instruction is stubbed"); break; } default: diff --git a/src/video_core/shader/decode/register_set_predicate.cpp b/src/video_core/shader/decode/register_set_predicate.cpp index e6c9d287e..8d54cce34 100644 --- a/src/video_core/shader/decode/register_set_predicate.cpp +++ b/src/video_core/shader/decode/register_set_predicate.cpp @@ -13,37 +13,65 @@ namespace VideoCommon::Shader { using Tegra::Shader::Instruction; using Tegra::Shader::OpCode; +namespace { +constexpr u64 NUM_PROGRAMMABLE_PREDICATES = 7; +} + u32 ShaderIR::DecodeRegisterSetPredicate(NodeBlock& bb, u32 pc) { const Instruction instr = {program_code[pc]}; const auto opcode = OpCode::Decode(instr); - UNIMPLEMENTED_IF(instr.r2p.mode != Tegra::Shader::R2pMode::Pr); + UNIMPLEMENTED_IF(instr.p2r_r2p.mode != Tegra::Shader::R2pMode::Pr); - const Node apply_mask = [&]() { + const Node apply_mask = [&] { switch (opcode->get().GetId()) { case OpCode::Id::R2P_IMM: - return Immediate(static_cast<u32>(instr.r2p.immediate_mask)); + case OpCode::Id::P2R_IMM: + return Immediate(static_cast<u32>(instr.p2r_r2p.immediate_mask)); default: UNREACHABLE(); - return Immediate(static_cast<u32>(instr.r2p.immediate_mask)); + return Immediate(0); } }(); - const Node mask = GetRegister(instr.gpr8); - const auto offset = static_cast<u32>(instr.r2p.byte) * 8; - constexpr u32 programmable_preds = 7; - for (u64 pred = 0; pred < programmable_preds; ++pred) { - const auto shift = static_cast<u32>(pred); + const auto offset = static_cast<u32>(instr.p2r_r2p.byte) * 8; + + switch (opcode->get().GetId()) { + case OpCode::Id::R2P_IMM: { + const Node mask = GetRegister(instr.gpr8); - const Node apply_compare = BitfieldExtract(apply_mask, shift, 1); - const Node condition = - Operation(OperationCode::LogicalUNotEqual, apply_compare, Immediate(0)); + for (u64 pred = 0; pred < NUM_PROGRAMMABLE_PREDICATES; ++pred) { + const auto shift = static_cast<u32>(pred); - const Node value_compare = BitfieldExtract(mask, offset + shift, 1); - const Node value = Operation(OperationCode::LogicalUNotEqual, value_compare, Immediate(0)); + const Node apply_compare = BitfieldExtract(apply_mask, shift, 1); + const Node condition = + Operation(OperationCode::LogicalUNotEqual, apply_compare, Immediate(0)); - const Node code = Operation(OperationCode::LogicalAssign, GetPredicate(pred), value); - bb.push_back(Conditional(condition, {code})); + const Node value_compare = BitfieldExtract(mask, offset + shift, 1); + const Node value = + Operation(OperationCode::LogicalUNotEqual, value_compare, Immediate(0)); + + const Node code = Operation(OperationCode::LogicalAssign, GetPredicate(pred), value); + bb.push_back(Conditional(condition, {code})); + } + break; + } + case OpCode::Id::P2R_IMM: { + Node value = Immediate(0); + for (u64 pred = 0; pred < NUM_PROGRAMMABLE_PREDICATES; ++pred) { + Node bit = Operation(OperationCode::Select, GetPredicate(pred), Immediate(1U << pred), + Immediate(0)); + value = Operation(OperationCode::UBitwiseOr, std::move(value), std::move(bit)); + } + value = Operation(OperationCode::UBitwiseAnd, std::move(value), apply_mask); + value = BitfieldInsert(GetRegister(instr.gpr8), std::move(value), offset, 8); + + SetRegister(bb, instr.gpr0, std::move(value)); + break; + } + default: + UNIMPLEMENTED_MSG("Unhandled P2R/R2R instruction: {}", opcode->get().GetName()); + break; } return pc; diff --git a/src/video_core/shader/decode/texture.cpp b/src/video_core/shader/decode/texture.cpp index ca690b58b..4b14cdf58 100644 --- a/src/video_core/shader/decode/texture.cpp +++ b/src/video_core/shader/decode/texture.cpp @@ -44,10 +44,6 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) { bool is_bindless = false; switch (opcode->get().GetId()) { case OpCode::Id::TEX: { - if (instr.tex.UsesMiscMode(TextureMiscMode::NODEP)) { - LOG_WARNING(HW_GPU, "TEX.NODEP implementation is incomplete"); - } - const TextureType texture_type{instr.tex.texture_type}; const bool is_array = instr.tex.array != 0; const bool is_aoffi = instr.tex.UsesMiscMode(TextureMiscMode::AOFFI); @@ -62,10 +58,6 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) { UNIMPLEMENTED_IF_MSG(instr.tex.UsesMiscMode(TextureMiscMode::AOFFI), "AOFFI is not implemented"); - if (instr.tex.UsesMiscMode(TextureMiscMode::NODEP)) { - LOG_WARNING(HW_GPU, "TEX.NODEP implementation is incomplete"); - } - const TextureType texture_type{instr.tex_b.texture_type}; const bool is_array = instr.tex_b.array != 0; const bool is_aoffi = instr.tex.UsesMiscMode(TextureMiscMode::AOFFI); @@ -82,10 +74,6 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) { const bool depth_compare = instr.texs.UsesMiscMode(TextureMiscMode::DC); const auto process_mode = instr.texs.GetTextureProcessMode(); - if (instr.texs.UsesMiscMode(TextureMiscMode::NODEP)) { - LOG_WARNING(HW_GPU, "TEXS.NODEP implementation is incomplete"); - } - const Node4 components = GetTexsCode(instr, texture_type, process_mode, depth_compare, is_array); @@ -101,78 +89,142 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) { [[fallthrough]]; } case OpCode::Id::TLD4: { - ASSERT(instr.tld4.array == 0); UNIMPLEMENTED_IF_MSG(instr.tld4.UsesMiscMode(TextureMiscMode::NDV), "NDV is not implemented"); - UNIMPLEMENTED_IF_MSG(instr.tld4.UsesMiscMode(TextureMiscMode::PTP), - "PTP is not implemented"); - - if (instr.tld4.UsesMiscMode(TextureMiscMode::NODEP)) { - LOG_WARNING(HW_GPU, "TLD4.NODEP implementation is incomplete"); - } - const auto texture_type = instr.tld4.texture_type.Value(); const bool depth_compare = is_bindless ? instr.tld4_b.UsesMiscMode(TextureMiscMode::DC) : instr.tld4.UsesMiscMode(TextureMiscMode::DC); const bool is_array = instr.tld4.array != 0; const bool is_aoffi = is_bindless ? instr.tld4_b.UsesMiscMode(TextureMiscMode::AOFFI) : instr.tld4.UsesMiscMode(TextureMiscMode::AOFFI); - WriteTexInstructionFloat( - bb, instr, - GetTld4Code(instr, texture_type, depth_compare, is_array, is_aoffi, is_bindless)); + const bool is_ptp = is_bindless ? instr.tld4_b.UsesMiscMode(TextureMiscMode::PTP) + : instr.tld4.UsesMiscMode(TextureMiscMode::PTP); + WriteTexInstructionFloat(bb, instr, + GetTld4Code(instr, texture_type, depth_compare, is_array, is_aoffi, + is_ptp, is_bindless)); break; } case OpCode::Id::TLD4S: { - UNIMPLEMENTED_IF_MSG(instr.tld4s.UsesMiscMode(TextureMiscMode::AOFFI), - "AOFFI is not implemented"); - if (instr.tld4s.UsesMiscMode(TextureMiscMode::NODEP)) { - LOG_WARNING(HW_GPU, "TLD4S.NODEP implementation is incomplete"); - } - - const bool depth_compare = instr.tld4s.UsesMiscMode(TextureMiscMode::DC); + constexpr std::size_t num_coords = 2; + const bool is_aoffi = instr.tld4s.UsesMiscMode(TextureMiscMode::AOFFI); + const bool is_depth_compare = instr.tld4s.UsesMiscMode(TextureMiscMode::DC); const Node op_a = GetRegister(instr.gpr8); const Node op_b = GetRegister(instr.gpr20); // TODO(Subv): Figure out how the sampler type is encoded in the TLD4S instruction. std::vector<Node> coords; - if (depth_compare) { + std::vector<Node> aoffi; + Node depth_compare; + if (is_depth_compare) { // Note: TLD4S coordinate encoding works just like TEXS's const Node op_y = GetRegister(instr.gpr8.Value() + 1); coords.push_back(op_a); coords.push_back(op_y); - coords.push_back(op_b); + if (is_aoffi) { + aoffi = GetAoffiCoordinates(op_b, num_coords, true); + depth_compare = GetRegister(instr.gpr20.Value() + 1); + } else { + depth_compare = op_b; + } } else { + // There's no depth compare coords.push_back(op_a); - coords.push_back(op_b); + if (is_aoffi) { + coords.push_back(GetRegister(instr.gpr8.Value() + 1)); + aoffi = GetAoffiCoordinates(op_b, num_coords, true); + } else { + coords.push_back(op_b); + } } const Node component = Immediate(static_cast<u32>(instr.tld4s.component)); - const auto& sampler = - GetSampler(instr.sampler, {{TextureType::Texture2D, false, depth_compare}}); + const SamplerInfo info{TextureType::Texture2D, false, is_depth_compare}; + const Sampler& sampler = *GetSampler(instr.sampler, info); Node4 values; for (u32 element = 0; element < values.size(); ++element) { auto coords_copy = coords; - MetaTexture meta{sampler, {}, {}, {}, {}, {}, component, element}; + MetaTexture meta{sampler, {}, depth_compare, aoffi, {}, {}, {}, {}, component, element}; values[element] = Operation(OperationCode::TextureGather, meta, std::move(coords_copy)); } - WriteTexsInstructionFloat(bb, instr, values, true); + if (instr.tld4s.fp16_flag) { + WriteTexsInstructionHalfFloat(bb, instr, values, true); + } else { + WriteTexsInstructionFloat(bb, instr, values, true); + } break; } - case OpCode::Id::TXQ_B: + case OpCode::Id::TXD_B: is_bindless = true; [[fallthrough]]; - case OpCode::Id::TXQ: { - if (instr.txq.UsesMiscMode(TextureMiscMode::NODEP)) { - LOG_WARNING(HW_GPU, "TXQ.NODEP implementation is incomplete"); + case OpCode::Id::TXD: { + UNIMPLEMENTED_IF_MSG(instr.txd.UsesMiscMode(TextureMiscMode::AOFFI), + "AOFFI is not implemented"); + UNIMPLEMENTED_IF_MSG(instr.txd.is_array != 0, "TXD Array is not implemented"); + + u64 base_reg = instr.gpr8.Value(); + const auto derivate_reg = instr.gpr20.Value(); + const auto texture_type = instr.txd.texture_type.Value(); + const auto coord_count = GetCoordCount(texture_type); + + const Sampler* sampler = is_bindless + ? GetBindlessSampler(base_reg, {{texture_type, false, false}}) + : GetSampler(instr.sampler, {{texture_type, false, false}}); + Node4 values; + if (sampler == nullptr) { + for (u32 element = 0; element < values.size(); ++element) { + values[element] = Immediate(0); + } + WriteTexInstructionFloat(bb, instr, values); + break; + } + if (is_bindless) { + base_reg++; } + std::vector<Node> coords; + std::vector<Node> derivates; + for (std::size_t i = 0; i < coord_count; ++i) { + coords.push_back(GetRegister(base_reg + i)); + const std::size_t derivate = i * 2; + derivates.push_back(GetRegister(derivate_reg + derivate)); + derivates.push_back(GetRegister(derivate_reg + derivate + 1)); + } + + for (u32 element = 0; element < values.size(); ++element) { + MetaTexture meta{*sampler, {}, {}, {}, {}, derivates, {}, {}, {}, element}; + values[element] = Operation(OperationCode::TextureGradient, std::move(meta), coords); + } + + WriteTexInstructionFloat(bb, instr, values); + + break; + } + case OpCode::Id::TXQ_B: + is_bindless = true; + [[fallthrough]]; + case OpCode::Id::TXQ: { // TODO: The new commits on the texture refactor, change the way samplers work. // Sadly, not all texture instructions specify the type of texture their sampler // uses. This must be fixed at a later instance. - const auto& sampler = - is_bindless ? GetBindlessSampler(instr.gpr8, {}) : GetSampler(instr.sampler, {}); + const Sampler* sampler = + is_bindless ? GetBindlessSampler(instr.gpr8) : GetSampler(instr.sampler); + + if (sampler == nullptr) { + u32 indexer = 0; + for (u32 element = 0; element < 4; ++element) { + if (!instr.txq.IsComponentEnabled(element)) { + continue; + } + const Node value = Immediate(0); + SetTemporary(bb, indexer++, value); + } + for (u32 i = 0; i < indexer; ++i) { + SetRegister(bb, instr.gpr0.Value() + i, GetTemporary(i)); + } + break; + } u32 indexer = 0; switch (instr.txq.query_type) { @@ -181,7 +233,7 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) { if (!instr.txq.IsComponentEnabled(element)) { continue; } - MetaTexture meta{sampler, {}, {}, {}, {}, {}, {}, element}; + MetaTexture meta{*sampler, {}, {}, {}, {}, {}, {}, {}, {}, element}; const Node value = Operation(OperationCode::TextureQueryDimensions, meta, GetRegister(instr.gpr8.Value() + (is_bindless ? 1 : 0))); @@ -205,15 +257,25 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) { UNIMPLEMENTED_IF_MSG(instr.tmml.UsesMiscMode(Tegra::Shader::TextureMiscMode::NDV), "NDV is not implemented"); - if (instr.tmml.UsesMiscMode(TextureMiscMode::NODEP)) { - LOG_WARNING(HW_GPU, "TMML.NODEP implementation is incomplete"); - } - auto texture_type = instr.tmml.texture_type.Value(); const bool is_array = instr.tmml.array != 0; - const auto& sampler = - is_bindless ? GetBindlessSampler(instr.gpr20, {{texture_type, is_array, false}}) - : GetSampler(instr.sampler, {{texture_type, is_array, false}}); + const Sampler* sampler = + is_bindless ? GetBindlessSampler(instr.gpr20) : GetSampler(instr.sampler); + + if (sampler == nullptr) { + u32 indexer = 0; + for (u32 element = 0; element < 2; ++element) { + if (!instr.tmml.IsComponentEnabled(element)) { + continue; + } + const Node value = Immediate(0); + SetTemporary(bb, indexer++, value); + } + for (u32 i = 0; i < indexer; ++i) { + SetRegister(bb, instr.gpr0.Value() + i, GetTemporary(i)); + } + break; + } std::vector<Node> coords; @@ -240,7 +302,7 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) { continue; } auto params = coords; - MetaTexture meta{sampler, {}, {}, {}, {}, {}, {}, element}; + MetaTexture meta{*sampler, {}, {}, {}, {}, {}, {}, {}, {}, element}; const Node value = Operation(OperationCode::TextureQueryLod, meta, std::move(params)); SetTemporary(bb, indexer++, value); } @@ -254,10 +316,6 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) { UNIMPLEMENTED_IF_MSG(instr.tld.ms, "MS is not implemented"); UNIMPLEMENTED_IF_MSG(instr.tld.cl, "CL is not implemented"); - if (instr.tld.nodep_flag) { - LOG_WARNING(HW_GPU, "TLD.NODEP implementation is incomplete"); - } - WriteTexInstructionFloat(bb, instr, GetTldCode(instr)); break; } @@ -269,10 +327,6 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) { "AOFFI is not implemented"); UNIMPLEMENTED_IF_MSG(instr.tlds.UsesMiscMode(TextureMiscMode::MZ), "MZ is not implemented"); - if (instr.tlds.UsesMiscMode(TextureMiscMode::NODEP)) { - LOG_WARNING(HW_GPU, "TLDS.NODEP implementation is incomplete"); - } - const Node4 components = GetTldsCode(instr, texture_type, is_array); if (instr.tlds.fp32_flag) { @@ -289,68 +343,54 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) { return pc; } -const Sampler& ShaderIR::GetSampler(const Tegra::Shader::Sampler& sampler, - std::optional<SamplerInfo> sampler_info) { - const auto offset = static_cast<u32>(sampler.index.Value()); - - TextureType type; - bool is_array; - bool is_shadow; +ShaderIR::SamplerInfo ShaderIR::GetSamplerInfo(std::optional<SamplerInfo> sampler_info, u32 offset, + std::optional<u32> buffer) { if (sampler_info) { - type = sampler_info->type; - is_array = sampler_info->is_array; - is_shadow = sampler_info->is_shadow; - } else if (const auto sampler = locker.ObtainBoundSampler(offset)) { - type = sampler->texture_type.Value(); - is_array = sampler->is_array.Value() != 0; - is_shadow = sampler->is_shadow.Value() != 0; - } else { + return *sampler_info; + } + const auto sampler = + buffer ? locker.ObtainBindlessSampler(*buffer, offset) : locker.ObtainBoundSampler(offset); + if (!sampler) { LOG_WARNING(HW_GPU, "Unknown sampler info"); - type = TextureType::Texture2D; - is_array = false; - is_shadow = false; + return SamplerInfo{TextureType::Texture2D, false, false, false}; } + return SamplerInfo{sampler->texture_type, sampler->is_array != 0, sampler->is_shadow != 0, + sampler->is_buffer != 0}; +} + +const Sampler* ShaderIR::GetSampler(const Tegra::Shader::Sampler& sampler, + std::optional<SamplerInfo> sampler_info) { + const auto offset = static_cast<u32>(sampler.index.Value()); + const auto info = GetSamplerInfo(sampler_info, offset); // If this sampler has already been used, return the existing mapping. const auto it = std::find_if(used_samplers.begin(), used_samplers.end(), [offset](const Sampler& entry) { return entry.GetOffset() == offset; }); if (it != used_samplers.end()) { - ASSERT(!it->IsBindless() && it->GetType() == type && it->IsArray() == is_array && - it->IsShadow() == is_shadow); - return *it; + ASSERT(!it->IsBindless() && it->GetType() == info.type && it->IsArray() == info.is_array && + it->IsShadow() == info.is_shadow && it->IsBuffer() == info.is_buffer); + return &*it; } // Otherwise create a new mapping for this sampler const auto next_index = static_cast<u32>(used_samplers.size()); - return used_samplers.emplace_back(Sampler(next_index, offset, type, is_array, is_shadow)); + return &used_samplers.emplace_back(next_index, offset, info.type, info.is_array, info.is_shadow, + info.is_buffer); } -const Sampler& ShaderIR::GetBindlessSampler(const Tegra::Shader::Register& reg, +const Sampler* ShaderIR::GetBindlessSampler(Tegra::Shader::Register reg, std::optional<SamplerInfo> sampler_info) { const Node sampler_register = GetRegister(reg); const auto [base_sampler, buffer, offset] = TrackCbuf(sampler_register, global_code, static_cast<s64>(global_code.size())); ASSERT(base_sampler != nullptr); - - TextureType type; - bool is_array; - bool is_shadow; - if (sampler_info) { - type = sampler_info->type; - is_array = sampler_info->is_array; - is_shadow = sampler_info->is_shadow; - } else if (const auto sampler = locker.ObtainBindlessSampler(buffer, offset)) { - type = sampler->texture_type.Value(); - is_array = sampler->is_array.Value() != 0; - is_shadow = sampler->is_shadow.Value() != 0; - } else { - LOG_WARNING(HW_GPU, "Unknown sampler info"); - type = TextureType::Texture2D; - is_array = false; - is_shadow = false; + if (base_sampler == nullptr) { + return nullptr; } + const auto info = GetSamplerInfo(sampler_info, offset, buffer); + // If this sampler has already been used, return the existing mapping. const auto it = std::find_if(used_samplers.begin(), used_samplers.end(), @@ -358,15 +398,15 @@ const Sampler& ShaderIR::GetBindlessSampler(const Tegra::Shader::Register& reg, return entry.GetBuffer() == buffer && entry.GetOffset() == offset; }); if (it != used_samplers.end()) { - ASSERT(it->IsBindless() && it->GetType() == type && it->IsArray() == is_array && - it->IsShadow() == is_shadow); - return *it; + ASSERT(it->IsBindless() && it->GetType() == info.type && it->IsArray() == info.is_array && + it->IsShadow() == info.is_shadow); + return &*it; } // Otherwise create a new mapping for this sampler const auto next_index = static_cast<u32>(used_samplers.size()); - return used_samplers.emplace_back( - Sampler(next_index, offset, buffer, type, is_array, is_shadow)); + return &used_samplers.emplace_back(next_index, offset, buffer, info.type, info.is_array, + info.is_shadow, info.is_buffer); } void ShaderIR::WriteTexInstructionFloat(NodeBlock& bb, Instruction instr, const Node4& components) { @@ -409,14 +449,14 @@ void ShaderIR::WriteTexsInstructionFloat(NodeBlock& bb, Instruction instr, const } void ShaderIR::WriteTexsInstructionHalfFloat(NodeBlock& bb, Instruction instr, - const Node4& components) { + const Node4& components, bool ignore_mask) { // TEXS.F16 destionation registers are packed in two registers in pairs (just like any half // float instruction). Node4 values; u32 dest_elem = 0; for (u32 component = 0; component < 4; ++component) { - if (!instr.texs.IsComponentEnabled(component)) + if (!instr.texs.IsComponentEnabled(component) && !ignore_mask) continue; values[dest_elem++] = components[component]; } @@ -451,17 +491,23 @@ Node4 ShaderIR::GetTextureCode(Instruction instr, TextureType texture_type, (texture_type == TextureType::TextureCube && is_array && is_shadow), "This method is not supported."); - const auto& sampler = - is_bindless ? GetBindlessSampler(*bindless_reg, {{texture_type, is_array, is_shadow}}) - : GetSampler(instr.sampler, {{texture_type, is_array, is_shadow}}); + const SamplerInfo info{texture_type, is_array, is_shadow, false}; + const Sampler* sampler = + is_bindless ? GetBindlessSampler(*bindless_reg, info) : GetSampler(instr.sampler, info); + Node4 values; + if (sampler == nullptr) { + for (u32 element = 0; element < values.size(); ++element) { + values[element] = Immediate(0); + } + return values; + } const bool lod_needed = process_mode == TextureProcessMode::LZ || process_mode == TextureProcessMode::LL || process_mode == TextureProcessMode::LLA; - // LOD selection (either via bias or explicit textureLod) not - // supported in GL for sampler2DArrayShadow and - // samplerCubeArrayShadow. + // LOD selection (either via bias or explicit textureLod) not supported in GL for + // sampler2DArrayShadow and samplerCubeArrayShadow. const bool gl_lod_supported = !((texture_type == Tegra::Shader::TextureType::Texture2D && is_array && is_shadow) || (texture_type == Tegra::Shader::TextureType::TextureCube && is_array && is_shadow)); @@ -471,8 +517,8 @@ Node4 ShaderIR::GetTextureCode(Instruction instr, TextureType texture_type, UNIMPLEMENTED_IF(process_mode != TextureProcessMode::None && !gl_lod_supported); - Node bias = {}; - Node lod = {}; + Node bias; + Node lod; if (process_mode != TextureProcessMode::None && gl_lod_supported) { switch (process_mode) { case TextureProcessMode::LZ: @@ -493,10 +539,9 @@ Node4 ShaderIR::GetTextureCode(Instruction instr, TextureType texture_type, } } - Node4 values; for (u32 element = 0; element < values.size(); ++element) { auto copy_coords = coords; - MetaTexture meta{sampler, array, depth_compare, aoffi, bias, lod, {}, element}; + MetaTexture meta{*sampler, array, depth_compare, aoffi, {}, {}, bias, lod, {}, element}; values[element] = Operation(read_method, meta, std::move(copy_coords)); } @@ -593,7 +638,9 @@ Node4 ShaderIR::GetTexsCode(Instruction instr, TextureType texture_type, } Node4 ShaderIR::GetTld4Code(Instruction instr, TextureType texture_type, bool depth_compare, - bool is_array, bool is_aoffi, bool is_bindless) { + bool is_array, bool is_aoffi, bool is_ptp, bool is_bindless) { + ASSERT_MSG(!(is_aoffi && is_ptp), "AOFFI and PTP can't be enabled at the same time"); + const std::size_t coord_count = GetCoordCount(texture_type); // If enabled arrays index is always stored in the gpr8 field @@ -608,17 +655,26 @@ Node4 ShaderIR::GetTld4Code(Instruction instr, TextureType texture_type, bool de u64 parameter_register = instr.gpr20.Value(); - const auto& sampler = - is_bindless - ? GetBindlessSampler(parameter_register++, {{texture_type, is_array, depth_compare}}) - : GetSampler(instr.sampler, {{texture_type, is_array, depth_compare}}); + const SamplerInfo info{texture_type, is_array, depth_compare, false}; + const Sampler* sampler = is_bindless ? GetBindlessSampler(parameter_register++, info) + : GetSampler(instr.sampler, info); + Node4 values; + if (sampler == nullptr) { + for (u32 element = 0; element < values.size(); ++element) { + values[element] = Immediate(0); + } + return values; + } - std::vector<Node> aoffi; + std::vector<Node> aoffi, ptp; if (is_aoffi) { aoffi = GetAoffiCoordinates(GetRegister(parameter_register++), coord_count, true); + } else if (is_ptp) { + ptp = GetPtpCoordinates( + {GetRegister(parameter_register++), GetRegister(parameter_register++)}); } - Node dc{}; + Node dc; if (depth_compare) { dc = GetRegister(parameter_register++); } @@ -626,11 +682,10 @@ Node4 ShaderIR::GetTld4Code(Instruction instr, TextureType texture_type, bool de const Node component = is_bindless ? Immediate(static_cast<u32>(instr.tld4_b.component)) : Immediate(static_cast<u32>(instr.tld4.component)); - Node4 values; for (u32 element = 0; element < values.size(); ++element) { auto coords_copy = coords; - MetaTexture meta{sampler, GetRegister(array_register), dc, aoffi, {}, {}, component, - element}; + MetaTexture meta{ + *sampler, GetRegister(array_register), dc, aoffi, ptp, {}, {}, {}, component, element}; values[element] = Operation(OperationCode::TextureGather, meta, std::move(coords_copy)); } @@ -658,12 +713,12 @@ Node4 ShaderIR::GetTldCode(Tegra::Shader::Instruction instr) { // const Node aoffi_register{is_aoffi ? GetRegister(gpr20_cursor++) : nullptr}; // const Node multisample{is_multisample ? GetRegister(gpr20_cursor++) : nullptr}; - const auto& sampler = GetSampler(instr.sampler, {{texture_type, is_array, false}}); + const auto& sampler = *GetSampler(instr.sampler); Node4 values; for (u32 element = 0; element < values.size(); ++element) { auto coords_copy = coords; - MetaTexture meta{sampler, array_register, {}, {}, {}, lod, {}, element}; + MetaTexture meta{sampler, array_register, {}, {}, {}, {}, {}, lod, {}, element}; values[element] = Operation(OperationCode::TexelFetch, meta, std::move(coords_copy)); } @@ -671,6 +726,8 @@ Node4 ShaderIR::GetTldCode(Tegra::Shader::Instruction instr) { } Node4 ShaderIR::GetTldsCode(Instruction instr, TextureType texture_type, bool is_array) { + const Sampler& sampler = *GetSampler(instr.sampler); + const std::size_t type_coord_count = GetCoordCount(texture_type); const bool lod_enabled = instr.tlds.GetTextureProcessMode() == TextureProcessMode::LL; @@ -694,12 +751,24 @@ Node4 ShaderIR::GetTldsCode(Instruction instr, TextureType texture_type, bool is // When lod is used always is in gpr20 const Node lod = lod_enabled ? GetRegister(instr.gpr20) : Immediate(0); - const auto& sampler = GetSampler(instr.sampler, {{texture_type, is_array, false}}); + // Fill empty entries from the guest sampler + const std::size_t entry_coord_count = GetCoordCount(sampler.GetType()); + if (type_coord_count != entry_coord_count) { + LOG_WARNING(HW_GPU, "Bound and built texture types mismatch"); + + // When the size is higher we insert zeroes + for (std::size_t i = type_coord_count; i < entry_coord_count; ++i) { + coords.push_back(GetRegister(Register::ZeroIndex)); + } + + // Then we ensure the size matches the number of entries (dropping unused values) + coords.resize(entry_coord_count); + } Node4 values; for (u32 element = 0; element < values.size(); ++element) { auto coords_copy = coords; - MetaTexture meta{sampler, array, {}, {}, {}, lod, {}, element}; + MetaTexture meta{sampler, array, {}, {}, {}, {}, {}, lod, {}, element}; values[element] = Operation(OperationCode::TexelFetch, meta, std::move(coords_copy)); } return values; @@ -764,4 +833,38 @@ std::vector<Node> ShaderIR::GetAoffiCoordinates(Node aoffi_reg, std::size_t coor return aoffi; } +std::vector<Node> ShaderIR::GetPtpCoordinates(std::array<Node, 2> ptp_regs) { + static constexpr u32 num_entries = 8; + + std::vector<Node> ptp; + ptp.reserve(num_entries); + + const auto global_size = static_cast<s64>(global_code.size()); + const std::optional low = TrackImmediate(ptp_regs[0], global_code, global_size); + const std::optional high = TrackImmediate(ptp_regs[1], global_code, global_size); + if (!low || !high) { + for (u32 entry = 0; entry < num_entries; ++entry) { + const u32 reg = entry / 4; + const u32 offset = entry % 4; + const Node value = BitfieldExtract(ptp_regs[reg], offset * 8, 6); + const Node condition = + Operation(OperationCode::LogicalIGreaterEqual, value, Immediate(32)); + const Node negative = Operation(OperationCode::IAdd, value, Immediate(-64)); + ptp.push_back(Operation(OperationCode::Select, condition, negative, value)); + } + return ptp; + } + + const u64 immediate = (static_cast<u64>(*high) << 32) | static_cast<u64>(*low); + for (u32 entry = 0; entry < num_entries; ++entry) { + s32 value = (immediate >> (entry * 8)) & 0b111111; + if (value >= 32) { + value -= 64; + } + ptp.push_back(Immediate(value)); + } + + return ptp; +} + } // namespace VideoCommon::Shader diff --git a/src/video_core/shader/decode/warp.cpp b/src/video_core/shader/decode/warp.cpp index fa8a250cc..11b77f795 100644 --- a/src/video_core/shader/decode/warp.cpp +++ b/src/video_core/shader/decode/warp.cpp @@ -17,6 +17,7 @@ using Tegra::Shader::ShuffleOperation; using Tegra::Shader::VoteOperation; namespace { + OperationCode GetOperationCode(VoteOperation vote_op) { switch (vote_op) { case VoteOperation::All: @@ -30,12 +31,16 @@ OperationCode GetOperationCode(VoteOperation vote_op) { return OperationCode::VoteAll; } } + } // Anonymous namespace u32 ShaderIR::DecodeWarp(NodeBlock& bb, u32 pc) { const Instruction instr = {program_code[pc]}; const auto opcode = OpCode::Decode(instr); + // Signal the backend that this shader uses warp instructions. + uses_warps = true; + switch (opcode->get().GetId()) { case OpCode::Id::VOTE: { const Node value = GetPredicate(instr.vote.value, instr.vote.negate_value != 0); @@ -46,50 +51,59 @@ u32 ShaderIR::DecodeWarp(NodeBlock& bb, u32 pc) { break; } case OpCode::Id::SHFL: { - Node width = [this, instr] { - Node mask = instr.shfl.is_mask_imm ? Immediate(static_cast<u32>(instr.shfl.mask_imm)) - : GetRegister(instr.gpr39); - - // Convert the obscure SHFL mask back into GL_NV_shader_thread_shuffle's width. This has - // been done reversing Nvidia's math. It won't work on all cases due to SHFL having - // different parameters that don't properly map to GLSL's interface, but it should work - // for cases emitted by Nvidia's compiler. - if (instr.shfl.operation == ShuffleOperation::Up) { - return Operation( - OperationCode::ILogicalShiftRight, - Operation(OperationCode::IAdd, std::move(mask), Immediate(-0x2000)), - Immediate(8)); - } else { - return Operation(OperationCode::ILogicalShiftRight, - Operation(OperationCode::IAdd, Immediate(0x201F), - Operation(OperationCode::INegate, std::move(mask))), - Immediate(8)); - } - }(); + Node mask = instr.shfl.is_mask_imm ? Immediate(static_cast<u32>(instr.shfl.mask_imm)) + : GetRegister(instr.gpr39); + Node index = instr.shfl.is_index_imm ? Immediate(static_cast<u32>(instr.shfl.index_imm)) + : GetRegister(instr.gpr20); - const auto [operation, in_range] = [instr]() -> std::pair<OperationCode, OperationCode> { + Node thread_id = Operation(OperationCode::ThreadId); + Node clamp = Operation(OperationCode::IBitwiseAnd, mask, Immediate(0x1FU)); + Node seg_mask = BitfieldExtract(mask, 8, 16); + + Node neg_seg_mask = Operation(OperationCode::IBitwiseNot, seg_mask); + Node min_thread_id = Operation(OperationCode::IBitwiseAnd, thread_id, seg_mask); + Node max_thread_id = Operation(OperationCode::IBitwiseOr, min_thread_id, + Operation(OperationCode::IBitwiseAnd, clamp, neg_seg_mask)); + + Node src_thread_id = [instr, index, neg_seg_mask, min_thread_id, thread_id] { switch (instr.shfl.operation) { case ShuffleOperation::Idx: - return {OperationCode::ShuffleIndexed, OperationCode::InRangeShuffleIndexed}; - case ShuffleOperation::Up: - return {OperationCode::ShuffleUp, OperationCode::InRangeShuffleUp}; + return Operation(OperationCode::IBitwiseOr, + Operation(OperationCode::IBitwiseAnd, index, neg_seg_mask), + min_thread_id); case ShuffleOperation::Down: - return {OperationCode::ShuffleDown, OperationCode::InRangeShuffleDown}; + return Operation(OperationCode::IAdd, thread_id, index); + case ShuffleOperation::Up: + return Operation(OperationCode::IAdd, thread_id, + Operation(OperationCode::INegate, index)); case ShuffleOperation::Bfly: - return {OperationCode::ShuffleButterfly, OperationCode::InRangeShuffleButterfly}; + return Operation(OperationCode::IBitwiseXor, thread_id, index); } - UNREACHABLE_MSG("Invalid SHFL operation: {}", - static_cast<u64>(instr.shfl.operation.Value())); - return {}; + UNREACHABLE(); + return Immediate(0U); }(); - // Setting the predicate before the register is intentional to avoid overwriting. - Node index = instr.shfl.is_index_imm ? Immediate(static_cast<u32>(instr.shfl.index_imm)) - : GetRegister(instr.gpr20); - SetPredicate(bb, instr.shfl.pred48, Operation(in_range, index, width)); + Node in_bounds = [instr, src_thread_id, min_thread_id, max_thread_id] { + if (instr.shfl.operation == ShuffleOperation::Up) { + return Operation(OperationCode::LogicalIGreaterEqual, src_thread_id, min_thread_id); + } else { + return Operation(OperationCode::LogicalILessEqual, src_thread_id, max_thread_id); + } + }(); + + SetPredicate(bb, instr.shfl.pred48, in_bounds); SetRegister( bb, instr.gpr0, - Operation(operation, GetRegister(instr.gpr8), std::move(index), std::move(width))); + Operation(OperationCode::ShuffleIndexed, GetRegister(instr.gpr8), src_thread_id)); + break; + } + case OpCode::Id::FSWZADD: { + UNIMPLEMENTED_IF(instr.fswzadd.ndv); + + Node op_a = GetRegister(instr.gpr8); + Node op_b = GetRegister(instr.gpr20); + Node mask = Immediate(static_cast<u32>(instr.fswzadd.swizzle)); + SetRegister(bb, instr.gpr0, Operation(OperationCode::FSwizzleAdd, op_a, op_b, mask)); break; } default: |