diff options
author | bunnei <bunneidev@gmail.com> | 2021-07-25 20:39:04 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-07-25 20:39:04 +0200 |
commit | 98b26b6e126d4775fdf3f773fe8a8ac808a8ff8f (patch) | |
tree | 816faa96c2c4d291825063433331a8ea4b3d08f1 /src/shader_recompiler/ir_opt | |
parent | Merge pull request #6699 from lat9nq/common-threads (diff) | |
parent | shader: Support out of bound local memory reads and immediate writes (diff) | |
download | yuzu-98b26b6e126d4775fdf3f773fe8a8ac808a8ff8f.tar yuzu-98b26b6e126d4775fdf3f773fe8a8ac808a8ff8f.tar.gz yuzu-98b26b6e126d4775fdf3f773fe8a8ac808a8ff8f.tar.bz2 yuzu-98b26b6e126d4775fdf3f773fe8a8ac808a8ff8f.tar.lz yuzu-98b26b6e126d4775fdf3f773fe8a8ac808a8ff8f.tar.xz yuzu-98b26b6e126d4775fdf3f773fe8a8ac808a8ff8f.tar.zst yuzu-98b26b6e126d4775fdf3f773fe8a8ac808a8ff8f.zip |
Diffstat (limited to 'src/shader_recompiler/ir_opt')
-rw-r--r-- | src/shader_recompiler/ir_opt/collect_shader_info_pass.cpp | 928 | ||||
-rw-r--r-- | src/shader_recompiler/ir_opt/constant_propagation_pass.cpp | 610 | ||||
-rw-r--r-- | src/shader_recompiler/ir_opt/dead_code_elimination_pass.cpp | 26 | ||||
-rw-r--r-- | src/shader_recompiler/ir_opt/dual_vertex_pass.cpp | 30 | ||||
-rw-r--r-- | src/shader_recompiler/ir_opt/global_memory_to_storage_buffer_pass.cpp | 526 | ||||
-rw-r--r-- | src/shader_recompiler/ir_opt/identity_removal_pass.cpp | 38 | ||||
-rw-r--r-- | src/shader_recompiler/ir_opt/lower_fp16_to_fp32.cpp | 143 | ||||
-rw-r--r-- | src/shader_recompiler/ir_opt/lower_int64_to_int32.cpp | 218 | ||||
-rw-r--r-- | src/shader_recompiler/ir_opt/passes.h | 32 | ||||
-rw-r--r-- | src/shader_recompiler/ir_opt/ssa_rewrite_pass.cpp | 383 | ||||
-rw-r--r-- | src/shader_recompiler/ir_opt/texture_pass.cpp | 523 | ||||
-rw-r--r-- | src/shader_recompiler/ir_opt/verification_pass.cpp | 98 |
12 files changed, 3555 insertions, 0 deletions
diff --git a/src/shader_recompiler/ir_opt/collect_shader_info_pass.cpp b/src/shader_recompiler/ir_opt/collect_shader_info_pass.cpp new file mode 100644 index 000000000..5ead930f1 --- /dev/null +++ b/src/shader_recompiler/ir_opt/collect_shader_info_pass.cpp @@ -0,0 +1,928 @@ +// Copyright 2021 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include "common/alignment.h" +#include "shader_recompiler/environment.h" +#include "shader_recompiler/frontend/ir/modifiers.h" +#include "shader_recompiler/frontend/ir/program.h" +#include "shader_recompiler/frontend/ir/value.h" +#include "shader_recompiler/ir_opt/passes.h" +#include "shader_recompiler/shader_info.h" + +namespace Shader::Optimization { +namespace { +void AddConstantBufferDescriptor(Info& info, u32 index, u32 count) { + if (count != 1) { + throw NotImplementedException("Constant buffer descriptor indexing"); + } + if ((info.constant_buffer_mask & (1U << index)) != 0) { + return; + } + info.constant_buffer_mask |= 1U << index; + + auto& cbufs{info.constant_buffer_descriptors}; + cbufs.insert(std::ranges::lower_bound(cbufs, index, {}, &ConstantBufferDescriptor::index), + ConstantBufferDescriptor{ + .index = index, + .count = 1, + }); +} + +void GetPatch(Info& info, IR::Patch patch) { + if (!IR::IsGeneric(patch)) { + throw NotImplementedException("Reading non-generic patch {}", patch); + } + info.uses_patches.at(IR::GenericPatchIndex(patch)) = true; +} + +void SetPatch(Info& info, IR::Patch patch) { + if (IR::IsGeneric(patch)) { + info.uses_patches.at(IR::GenericPatchIndex(patch)) = true; + return; + } + switch (patch) { + case IR::Patch::TessellationLodLeft: + case IR::Patch::TessellationLodTop: + case IR::Patch::TessellationLodRight: + case IR::Patch::TessellationLodBottom: + info.stores_tess_level_outer = true; + break; + case IR::Patch::TessellationLodInteriorU: + case IR::Patch::TessellationLodInteriorV: + info.stores_tess_level_inner = true; + break; + default: + throw NotImplementedException("Set patch {}", patch); + } +} + +void CheckCBufNVN(Info& info, IR::Inst& inst) { + const IR::Value cbuf_index{inst.Arg(0)}; + if (!cbuf_index.IsImmediate()) { + info.nvn_buffer_used.set(); + return; + } + const u32 index{cbuf_index.U32()}; + if (index != 0) { + return; + } + const IR::Value cbuf_offset{inst.Arg(1)}; + if (!cbuf_offset.IsImmediate()) { + info.nvn_buffer_used.set(); + return; + } + const u32 offset{cbuf_offset.U32()}; + const u32 descriptor_size{0x10}; + const u32 upper_limit{info.nvn_buffer_base + descriptor_size * 16}; + if (offset >= info.nvn_buffer_base && offset < upper_limit) { + const std::size_t nvn_index{(offset - info.nvn_buffer_base) / descriptor_size}; + info.nvn_buffer_used.set(nvn_index, true); + } +} + +void VisitUsages(Info& info, IR::Inst& inst) { + switch (inst.GetOpcode()) { + case IR::Opcode::CompositeConstructF16x2: + case IR::Opcode::CompositeConstructF16x3: + case IR::Opcode::CompositeConstructF16x4: + case IR::Opcode::CompositeExtractF16x2: + case IR::Opcode::CompositeExtractF16x3: + case IR::Opcode::CompositeExtractF16x4: + case IR::Opcode::CompositeInsertF16x2: + case IR::Opcode::CompositeInsertF16x3: + case IR::Opcode::CompositeInsertF16x4: + case IR::Opcode::SelectF16: + case IR::Opcode::BitCastU16F16: + case IR::Opcode::BitCastF16U16: + case IR::Opcode::PackFloat2x16: + case IR::Opcode::UnpackFloat2x16: + case IR::Opcode::ConvertS16F16: + case IR::Opcode::ConvertS32F16: + case IR::Opcode::ConvertS64F16: + case IR::Opcode::ConvertU16F16: + case IR::Opcode::ConvertU32F16: + case IR::Opcode::ConvertU64F16: + case IR::Opcode::ConvertF16S8: + case IR::Opcode::ConvertF16S16: + case IR::Opcode::ConvertF16S32: + case IR::Opcode::ConvertF16S64: + case IR::Opcode::ConvertF16U8: + case IR::Opcode::ConvertF16U16: + case IR::Opcode::ConvertF16U32: + case IR::Opcode::ConvertF16U64: + case IR::Opcode::FPAbs16: + case IR::Opcode::FPAdd16: + case IR::Opcode::FPCeil16: + case IR::Opcode::FPFloor16: + case IR::Opcode::FPFma16: + case IR::Opcode::FPMul16: + case IR::Opcode::FPNeg16: + case IR::Opcode::FPRoundEven16: + case IR::Opcode::FPSaturate16: + case IR::Opcode::FPClamp16: + case IR::Opcode::FPTrunc16: + case IR::Opcode::FPOrdEqual16: + case IR::Opcode::FPUnordEqual16: + case IR::Opcode::FPOrdNotEqual16: + case IR::Opcode::FPUnordNotEqual16: + case IR::Opcode::FPOrdLessThan16: + case IR::Opcode::FPUnordLessThan16: + case IR::Opcode::FPOrdGreaterThan16: + case IR::Opcode::FPUnordGreaterThan16: + case IR::Opcode::FPOrdLessThanEqual16: + case IR::Opcode::FPUnordLessThanEqual16: + case IR::Opcode::FPOrdGreaterThanEqual16: + case IR::Opcode::FPUnordGreaterThanEqual16: + case IR::Opcode::FPIsNan16: + case IR::Opcode::GlobalAtomicAddF16x2: + case IR::Opcode::GlobalAtomicMinF16x2: + case IR::Opcode::GlobalAtomicMaxF16x2: + case IR::Opcode::StorageAtomicAddF16x2: + case IR::Opcode::StorageAtomicMinF16x2: + case IR::Opcode::StorageAtomicMaxF16x2: + info.uses_fp16 = true; + break; + case IR::Opcode::CompositeConstructF64x2: + case IR::Opcode::CompositeConstructF64x3: + case IR::Opcode::CompositeConstructF64x4: + case IR::Opcode::CompositeExtractF64x2: + case IR::Opcode::CompositeExtractF64x3: + case IR::Opcode::CompositeExtractF64x4: + case IR::Opcode::CompositeInsertF64x2: + case IR::Opcode::CompositeInsertF64x3: + case IR::Opcode::CompositeInsertF64x4: + case IR::Opcode::SelectF64: + case IR::Opcode::BitCastU64F64: + case IR::Opcode::BitCastF64U64: + case IR::Opcode::PackDouble2x32: + case IR::Opcode::UnpackDouble2x32: + case IR::Opcode::FPAbs64: + case IR::Opcode::FPAdd64: + case IR::Opcode::FPCeil64: + case IR::Opcode::FPFloor64: + case IR::Opcode::FPFma64: + case IR::Opcode::FPMax64: + case IR::Opcode::FPMin64: + case IR::Opcode::FPMul64: + case IR::Opcode::FPNeg64: + case IR::Opcode::FPRecip64: + case IR::Opcode::FPRecipSqrt64: + case IR::Opcode::FPRoundEven64: + case IR::Opcode::FPSaturate64: + case IR::Opcode::FPClamp64: + case IR::Opcode::FPTrunc64: + case IR::Opcode::FPOrdEqual64: + case IR::Opcode::FPUnordEqual64: + case IR::Opcode::FPOrdNotEqual64: + case IR::Opcode::FPUnordNotEqual64: + case IR::Opcode::FPOrdLessThan64: + case IR::Opcode::FPUnordLessThan64: + case IR::Opcode::FPOrdGreaterThan64: + case IR::Opcode::FPUnordGreaterThan64: + case IR::Opcode::FPOrdLessThanEqual64: + case IR::Opcode::FPUnordLessThanEqual64: + case IR::Opcode::FPOrdGreaterThanEqual64: + case IR::Opcode::FPUnordGreaterThanEqual64: + case IR::Opcode::FPIsNan64: + case IR::Opcode::ConvertS16F64: + case IR::Opcode::ConvertS32F64: + case IR::Opcode::ConvertS64F64: + case IR::Opcode::ConvertU16F64: + case IR::Opcode::ConvertU32F64: + case IR::Opcode::ConvertU64F64: + case IR::Opcode::ConvertF32F64: + case IR::Opcode::ConvertF64F32: + case IR::Opcode::ConvertF64S8: + case IR::Opcode::ConvertF64S16: + case IR::Opcode::ConvertF64S32: + case IR::Opcode::ConvertF64S64: + case IR::Opcode::ConvertF64U8: + case IR::Opcode::ConvertF64U16: + case IR::Opcode::ConvertF64U32: + case IR::Opcode::ConvertF64U64: + info.uses_fp64 = true; + break; + default: + break; + } + switch (inst.GetOpcode()) { + case IR::Opcode::GetCbufU8: + case IR::Opcode::GetCbufS8: + case IR::Opcode::UndefU8: + case IR::Opcode::LoadGlobalU8: + case IR::Opcode::LoadGlobalS8: + case IR::Opcode::WriteGlobalU8: + case IR::Opcode::WriteGlobalS8: + case IR::Opcode::LoadStorageU8: + case IR::Opcode::LoadStorageS8: + case IR::Opcode::WriteStorageU8: + case IR::Opcode::WriteStorageS8: + case IR::Opcode::LoadSharedU8: + case IR::Opcode::LoadSharedS8: + case IR::Opcode::WriteSharedU8: + case IR::Opcode::SelectU8: + case IR::Opcode::ConvertF16S8: + case IR::Opcode::ConvertF16U8: + case IR::Opcode::ConvertF32S8: + case IR::Opcode::ConvertF32U8: + case IR::Opcode::ConvertF64S8: + case IR::Opcode::ConvertF64U8: + info.uses_int8 = true; + break; + default: + break; + } + switch (inst.GetOpcode()) { + case IR::Opcode::GetCbufU16: + case IR::Opcode::GetCbufS16: + case IR::Opcode::UndefU16: + case IR::Opcode::LoadGlobalU16: + case IR::Opcode::LoadGlobalS16: + case IR::Opcode::WriteGlobalU16: + case IR::Opcode::WriteGlobalS16: + case IR::Opcode::LoadStorageU16: + case IR::Opcode::LoadStorageS16: + case IR::Opcode::WriteStorageU16: + case IR::Opcode::WriteStorageS16: + case IR::Opcode::LoadSharedU16: + case IR::Opcode::LoadSharedS16: + case IR::Opcode::WriteSharedU16: + case IR::Opcode::SelectU16: + case IR::Opcode::BitCastU16F16: + case IR::Opcode::BitCastF16U16: + case IR::Opcode::ConvertS16F16: + case IR::Opcode::ConvertS16F32: + case IR::Opcode::ConvertS16F64: + case IR::Opcode::ConvertU16F16: + case IR::Opcode::ConvertU16F32: + case IR::Opcode::ConvertU16F64: + case IR::Opcode::ConvertF16S16: + case IR::Opcode::ConvertF16U16: + case IR::Opcode::ConvertF32S16: + case IR::Opcode::ConvertF32U16: + case IR::Opcode::ConvertF64S16: + case IR::Opcode::ConvertF64U16: + info.uses_int16 = true; + break; + default: + break; + } + switch (inst.GetOpcode()) { + case IR::Opcode::UndefU64: + case IR::Opcode::LoadGlobalU8: + case IR::Opcode::LoadGlobalS8: + case IR::Opcode::LoadGlobalU16: + case IR::Opcode::LoadGlobalS16: + case IR::Opcode::LoadGlobal32: + case IR::Opcode::LoadGlobal64: + case IR::Opcode::LoadGlobal128: + case IR::Opcode::WriteGlobalU8: + case IR::Opcode::WriteGlobalS8: + case IR::Opcode::WriteGlobalU16: + case IR::Opcode::WriteGlobalS16: + case IR::Opcode::WriteGlobal32: + case IR::Opcode::WriteGlobal64: + case IR::Opcode::WriteGlobal128: + case IR::Opcode::SelectU64: + case IR::Opcode::BitCastU64F64: + case IR::Opcode::BitCastF64U64: + case IR::Opcode::PackUint2x32: + case IR::Opcode::UnpackUint2x32: + case IR::Opcode::IAdd64: + case IR::Opcode::ISub64: + case IR::Opcode::INeg64: + case IR::Opcode::ShiftLeftLogical64: + case IR::Opcode::ShiftRightLogical64: + case IR::Opcode::ShiftRightArithmetic64: + case IR::Opcode::ConvertS64F16: + case IR::Opcode::ConvertS64F32: + case IR::Opcode::ConvertS64F64: + case IR::Opcode::ConvertU64F16: + case IR::Opcode::ConvertU64F32: + case IR::Opcode::ConvertU64F64: + case IR::Opcode::ConvertU64U32: + case IR::Opcode::ConvertU32U64: + case IR::Opcode::ConvertF16U64: + case IR::Opcode::ConvertF32U64: + case IR::Opcode::ConvertF64U64: + case IR::Opcode::SharedAtomicExchange64: + case IR::Opcode::GlobalAtomicIAdd64: + case IR::Opcode::GlobalAtomicSMin64: + case IR::Opcode::GlobalAtomicUMin64: + case IR::Opcode::GlobalAtomicSMax64: + case IR::Opcode::GlobalAtomicUMax64: + case IR::Opcode::GlobalAtomicAnd64: + case IR::Opcode::GlobalAtomicOr64: + case IR::Opcode::GlobalAtomicXor64: + case IR::Opcode::GlobalAtomicExchange64: + case IR::Opcode::StorageAtomicIAdd64: + case IR::Opcode::StorageAtomicSMin64: + case IR::Opcode::StorageAtomicUMin64: + case IR::Opcode::StorageAtomicSMax64: + case IR::Opcode::StorageAtomicUMax64: + case IR::Opcode::StorageAtomicAnd64: + case IR::Opcode::StorageAtomicOr64: + case IR::Opcode::StorageAtomicXor64: + case IR::Opcode::StorageAtomicExchange64: + info.uses_int64 = true; + break; + default: + break; + } + switch (inst.GetOpcode()) { + case IR::Opcode::WriteGlobalU8: + case IR::Opcode::WriteGlobalS8: + case IR::Opcode::WriteGlobalU16: + case IR::Opcode::WriteGlobalS16: + case IR::Opcode::WriteGlobal32: + case IR::Opcode::WriteGlobal64: + case IR::Opcode::WriteGlobal128: + case IR::Opcode::GlobalAtomicIAdd32: + case IR::Opcode::GlobalAtomicSMin32: + case IR::Opcode::GlobalAtomicUMin32: + case IR::Opcode::GlobalAtomicSMax32: + case IR::Opcode::GlobalAtomicUMax32: + case IR::Opcode::GlobalAtomicInc32: + case IR::Opcode::GlobalAtomicDec32: + case IR::Opcode::GlobalAtomicAnd32: + case IR::Opcode::GlobalAtomicOr32: + case IR::Opcode::GlobalAtomicXor32: + case IR::Opcode::GlobalAtomicExchange32: + case IR::Opcode::GlobalAtomicIAdd64: + case IR::Opcode::GlobalAtomicSMin64: + case IR::Opcode::GlobalAtomicUMin64: + case IR::Opcode::GlobalAtomicSMax64: + case IR::Opcode::GlobalAtomicUMax64: + case IR::Opcode::GlobalAtomicAnd64: + case IR::Opcode::GlobalAtomicOr64: + case IR::Opcode::GlobalAtomicXor64: + case IR::Opcode::GlobalAtomicExchange64: + case IR::Opcode::GlobalAtomicAddF32: + case IR::Opcode::GlobalAtomicAddF16x2: + case IR::Opcode::GlobalAtomicAddF32x2: + case IR::Opcode::GlobalAtomicMinF16x2: + case IR::Opcode::GlobalAtomicMinF32x2: + case IR::Opcode::GlobalAtomicMaxF16x2: + case IR::Opcode::GlobalAtomicMaxF32x2: + info.stores_global_memory = true; + [[fallthrough]]; + case IR::Opcode::LoadGlobalU8: + case IR::Opcode::LoadGlobalS8: + case IR::Opcode::LoadGlobalU16: + case IR::Opcode::LoadGlobalS16: + case IR::Opcode::LoadGlobal32: + case IR::Opcode::LoadGlobal64: + case IR::Opcode::LoadGlobal128: + info.uses_int64 = true; + info.uses_global_memory = true; + info.used_constant_buffer_types |= IR::Type::U32 | IR::Type::U32x2; + info.used_storage_buffer_types |= IR::Type::U32 | IR::Type::U32x2 | IR::Type::U32x4; + break; + default: + break; + } + switch (inst.GetOpcode()) { + case IR::Opcode::DemoteToHelperInvocation: + info.uses_demote_to_helper_invocation = true; + break; + case IR::Opcode::GetAttribute: + info.loads.mask[static_cast<size_t>(inst.Arg(0).Attribute())] = true; + break; + case IR::Opcode::SetAttribute: + info.stores.mask[static_cast<size_t>(inst.Arg(0).Attribute())] = true; + break; + case IR::Opcode::GetPatch: + GetPatch(info, inst.Arg(0).Patch()); + break; + case IR::Opcode::SetPatch: + SetPatch(info, inst.Arg(0).Patch()); + break; + case IR::Opcode::GetAttributeIndexed: + info.loads_indexed_attributes = true; + break; + case IR::Opcode::SetAttributeIndexed: + info.stores_indexed_attributes = true; + break; + case IR::Opcode::SetFragColor: + info.stores_frag_color[inst.Arg(0).U32()] = true; + break; + case IR::Opcode::SetSampleMask: + info.stores_sample_mask = true; + break; + case IR::Opcode::SetFragDepth: + info.stores_frag_depth = true; + break; + case IR::Opcode::WorkgroupId: + info.uses_workgroup_id = true; + break; + case IR::Opcode::LocalInvocationId: + info.uses_local_invocation_id = true; + break; + case IR::Opcode::InvocationId: + info.uses_invocation_id = true; + break; + case IR::Opcode::SampleId: + info.uses_sample_id = true; + break; + case IR::Opcode::IsHelperInvocation: + info.uses_is_helper_invocation = true; + break; + case IR::Opcode::LaneId: + info.uses_subgroup_invocation_id = true; + break; + case IR::Opcode::ShuffleIndex: + case IR::Opcode::ShuffleUp: + case IR::Opcode::ShuffleDown: + case IR::Opcode::ShuffleButterfly: + info.uses_subgroup_shuffles = true; + break; + case IR::Opcode::GetCbufU8: + case IR::Opcode::GetCbufS8: + case IR::Opcode::GetCbufU16: + case IR::Opcode::GetCbufS16: + case IR::Opcode::GetCbufU32: + case IR::Opcode::GetCbufF32: + case IR::Opcode::GetCbufU32x2: { + const IR::Value index{inst.Arg(0)}; + const IR::Value offset{inst.Arg(1)}; + if (!index.IsImmediate()) { + throw NotImplementedException("Constant buffer with non-immediate index"); + } + AddConstantBufferDescriptor(info, index.U32(), 1); + u32 element_size{}; + switch (inst.GetOpcode()) { + case IR::Opcode::GetCbufU8: + case IR::Opcode::GetCbufS8: + info.used_constant_buffer_types |= IR::Type::U8; + element_size = 1; + break; + case IR::Opcode::GetCbufU16: + case IR::Opcode::GetCbufS16: + info.used_constant_buffer_types |= IR::Type::U16; + element_size = 2; + break; + case IR::Opcode::GetCbufU32: + info.used_constant_buffer_types |= IR::Type::U32; + element_size = 4; + break; + case IR::Opcode::GetCbufF32: + info.used_constant_buffer_types |= IR::Type::F32; + element_size = 4; + break; + case IR::Opcode::GetCbufU32x2: + info.used_constant_buffer_types |= IR::Type::U32x2; + element_size = 8; + break; + default: + break; + } + u32& size{info.constant_buffer_used_sizes[index.U32()]}; + if (offset.IsImmediate()) { + size = Common::AlignUp(std::max(size, offset.U32() + element_size), 16u); + } else { + size = 0x10'000; + } + break; + } + case IR::Opcode::BindlessImageSampleImplicitLod: + case IR::Opcode::BindlessImageSampleExplicitLod: + case IR::Opcode::BindlessImageSampleDrefImplicitLod: + case IR::Opcode::BindlessImageSampleDrefExplicitLod: + case IR::Opcode::BindlessImageGather: + case IR::Opcode::BindlessImageGatherDref: + case IR::Opcode::BindlessImageFetch: + case IR::Opcode::BindlessImageQueryDimensions: + case IR::Opcode::BindlessImageQueryLod: + case IR::Opcode::BindlessImageGradient: + case IR::Opcode::BoundImageSampleImplicitLod: + case IR::Opcode::BoundImageSampleExplicitLod: + case IR::Opcode::BoundImageSampleDrefImplicitLod: + case IR::Opcode::BoundImageSampleDrefExplicitLod: + case IR::Opcode::BoundImageGather: + case IR::Opcode::BoundImageGatherDref: + case IR::Opcode::BoundImageFetch: + case IR::Opcode::BoundImageQueryDimensions: + case IR::Opcode::BoundImageQueryLod: + case IR::Opcode::BoundImageGradient: + case IR::Opcode::ImageGather: + case IR::Opcode::ImageGatherDref: + case IR::Opcode::ImageFetch: + case IR::Opcode::ImageQueryDimensions: + case IR::Opcode::ImageGradient: { + const TextureType type{inst.Flags<IR::TextureInstInfo>().type}; + info.uses_sampled_1d |= type == TextureType::Color1D || type == TextureType::ColorArray1D; + info.uses_sparse_residency |= + inst.GetAssociatedPseudoOperation(IR::Opcode::GetSparseFromOp) != nullptr; + break; + } + case IR::Opcode::ImageSampleImplicitLod: + case IR::Opcode::ImageSampleExplicitLod: + case IR::Opcode::ImageSampleDrefImplicitLod: + case IR::Opcode::ImageSampleDrefExplicitLod: + case IR::Opcode::ImageQueryLod: { + const auto flags{inst.Flags<IR::TextureInstInfo>()}; + const TextureType type{flags.type}; + info.uses_sampled_1d |= type == TextureType::Color1D || type == TextureType::ColorArray1D; + info.uses_shadow_lod |= flags.is_depth != 0; + info.uses_sparse_residency |= + inst.GetAssociatedPseudoOperation(IR::Opcode::GetSparseFromOp) != nullptr; + break; + } + case IR::Opcode::ImageRead: { + const auto flags{inst.Flags<IR::TextureInstInfo>()}; + info.uses_typeless_image_reads |= flags.image_format == ImageFormat::Typeless; + info.uses_sparse_residency |= + inst.GetAssociatedPseudoOperation(IR::Opcode::GetSparseFromOp) != nullptr; + break; + } + case IR::Opcode::ImageWrite: { + const auto flags{inst.Flags<IR::TextureInstInfo>()}; + info.uses_typeless_image_writes |= flags.image_format == ImageFormat::Typeless; + info.uses_image_buffers |= flags.type == TextureType::Buffer; + break; + } + case IR::Opcode::SubgroupEqMask: + case IR::Opcode::SubgroupLtMask: + case IR::Opcode::SubgroupLeMask: + case IR::Opcode::SubgroupGtMask: + case IR::Opcode::SubgroupGeMask: + info.uses_subgroup_mask = true; + break; + case IR::Opcode::VoteAll: + case IR::Opcode::VoteAny: + case IR::Opcode::VoteEqual: + case IR::Opcode::SubgroupBallot: + info.uses_subgroup_vote = true; + break; + case IR::Opcode::FSwizzleAdd: + info.uses_fswzadd = true; + break; + case IR::Opcode::DPdxFine: + case IR::Opcode::DPdyFine: + case IR::Opcode::DPdxCoarse: + case IR::Opcode::DPdyCoarse: + info.uses_derivatives = true; + break; + case IR::Opcode::LoadStorageU8: + case IR::Opcode::LoadStorageS8: + case IR::Opcode::WriteStorageU8: + case IR::Opcode::WriteStorageS8: + info.used_storage_buffer_types |= IR::Type::U8; + break; + case IR::Opcode::LoadStorageU16: + case IR::Opcode::LoadStorageS16: + case IR::Opcode::WriteStorageU16: + case IR::Opcode::WriteStorageS16: + info.used_storage_buffer_types |= IR::Type::U16; + break; + case IR::Opcode::LoadStorage32: + case IR::Opcode::WriteStorage32: + case IR::Opcode::StorageAtomicIAdd32: + case IR::Opcode::StorageAtomicUMin32: + case IR::Opcode::StorageAtomicUMax32: + case IR::Opcode::StorageAtomicAnd32: + case IR::Opcode::StorageAtomicOr32: + case IR::Opcode::StorageAtomicXor32: + case IR::Opcode::StorageAtomicExchange32: + info.used_storage_buffer_types |= IR::Type::U32; + break; + case IR::Opcode::LoadStorage64: + case IR::Opcode::WriteStorage64: + info.used_storage_buffer_types |= IR::Type::U32x2; + break; + case IR::Opcode::LoadStorage128: + case IR::Opcode::WriteStorage128: + info.used_storage_buffer_types |= IR::Type::U32x4; + break; + case IR::Opcode::SharedAtomicSMin32: + info.uses_atomic_s32_min = true; + break; + case IR::Opcode::SharedAtomicSMax32: + info.uses_atomic_s32_max = true; + break; + case IR::Opcode::SharedAtomicInc32: + info.uses_shared_increment = true; + break; + case IR::Opcode::SharedAtomicDec32: + info.uses_shared_decrement = true; + break; + case IR::Opcode::SharedAtomicExchange64: + info.uses_int64_bit_atomics = true; + break; + case IR::Opcode::GlobalAtomicInc32: + case IR::Opcode::StorageAtomicInc32: + info.used_storage_buffer_types |= IR::Type::U32; + info.uses_global_increment = true; + break; + case IR::Opcode::GlobalAtomicDec32: + case IR::Opcode::StorageAtomicDec32: + info.used_storage_buffer_types |= IR::Type::U32; + info.uses_global_decrement = true; + break; + case IR::Opcode::GlobalAtomicAddF32: + case IR::Opcode::StorageAtomicAddF32: + info.used_storage_buffer_types |= IR::Type::U32; + info.uses_atomic_f32_add = true; + break; + case IR::Opcode::GlobalAtomicAddF16x2: + case IR::Opcode::StorageAtomicAddF16x2: + info.used_storage_buffer_types |= IR::Type::U32; + info.uses_atomic_f16x2_add = true; + break; + case IR::Opcode::GlobalAtomicAddF32x2: + case IR::Opcode::StorageAtomicAddF32x2: + info.used_storage_buffer_types |= IR::Type::U32; + info.uses_atomic_f32x2_add = true; + break; + case IR::Opcode::GlobalAtomicMinF16x2: + case IR::Opcode::StorageAtomicMinF16x2: + info.used_storage_buffer_types |= IR::Type::U32; + info.uses_atomic_f16x2_min = true; + break; + case IR::Opcode::GlobalAtomicMinF32x2: + case IR::Opcode::StorageAtomicMinF32x2: + info.used_storage_buffer_types |= IR::Type::U32; + info.uses_atomic_f32x2_min = true; + break; + case IR::Opcode::GlobalAtomicMaxF16x2: + case IR::Opcode::StorageAtomicMaxF16x2: + info.used_storage_buffer_types |= IR::Type::U32; + info.uses_atomic_f16x2_max = true; + break; + case IR::Opcode::GlobalAtomicMaxF32x2: + case IR::Opcode::StorageAtomicMaxF32x2: + info.used_storage_buffer_types |= IR::Type::U32; + info.uses_atomic_f32x2_max = true; + break; + case IR::Opcode::StorageAtomicSMin32: + info.used_storage_buffer_types |= IR::Type::U32; + info.uses_atomic_s32_min = true; + break; + case IR::Opcode::StorageAtomicSMax32: + info.used_storage_buffer_types |= IR::Type::U32; + info.uses_atomic_s32_max = true; + break; + case IR::Opcode::GlobalAtomicIAdd64: + case IR::Opcode::GlobalAtomicSMin64: + case IR::Opcode::GlobalAtomicUMin64: + case IR::Opcode::GlobalAtomicSMax64: + case IR::Opcode::GlobalAtomicUMax64: + case IR::Opcode::GlobalAtomicAnd64: + case IR::Opcode::GlobalAtomicOr64: + case IR::Opcode::GlobalAtomicXor64: + case IR::Opcode::GlobalAtomicExchange64: + case IR::Opcode::StorageAtomicIAdd64: + case IR::Opcode::StorageAtomicSMin64: + case IR::Opcode::StorageAtomicUMin64: + case IR::Opcode::StorageAtomicSMax64: + case IR::Opcode::StorageAtomicUMax64: + case IR::Opcode::StorageAtomicAnd64: + case IR::Opcode::StorageAtomicOr64: + case IR::Opcode::StorageAtomicXor64: + info.used_storage_buffer_types |= IR::Type::U64; + info.uses_int64_bit_atomics = true; + break; + case IR::Opcode::BindlessImageAtomicIAdd32: + case IR::Opcode::BindlessImageAtomicSMin32: + case IR::Opcode::BindlessImageAtomicUMin32: + case IR::Opcode::BindlessImageAtomicSMax32: + case IR::Opcode::BindlessImageAtomicUMax32: + case IR::Opcode::BindlessImageAtomicInc32: + case IR::Opcode::BindlessImageAtomicDec32: + case IR::Opcode::BindlessImageAtomicAnd32: + case IR::Opcode::BindlessImageAtomicOr32: + case IR::Opcode::BindlessImageAtomicXor32: + case IR::Opcode::BindlessImageAtomicExchange32: + case IR::Opcode::BoundImageAtomicIAdd32: + case IR::Opcode::BoundImageAtomicSMin32: + case IR::Opcode::BoundImageAtomicUMin32: + case IR::Opcode::BoundImageAtomicSMax32: + case IR::Opcode::BoundImageAtomicUMax32: + case IR::Opcode::BoundImageAtomicInc32: + case IR::Opcode::BoundImageAtomicDec32: + case IR::Opcode::BoundImageAtomicAnd32: + case IR::Opcode::BoundImageAtomicOr32: + case IR::Opcode::BoundImageAtomicXor32: + case IR::Opcode::BoundImageAtomicExchange32: + case IR::Opcode::ImageAtomicIAdd32: + case IR::Opcode::ImageAtomicSMin32: + case IR::Opcode::ImageAtomicUMin32: + case IR::Opcode::ImageAtomicSMax32: + case IR::Opcode::ImageAtomicUMax32: + case IR::Opcode::ImageAtomicInc32: + case IR::Opcode::ImageAtomicDec32: + case IR::Opcode::ImageAtomicAnd32: + case IR::Opcode::ImageAtomicOr32: + case IR::Opcode::ImageAtomicXor32: + case IR::Opcode::ImageAtomicExchange32: + info.uses_atomic_image_u32 = true; + break; + default: + break; + } +} + +void VisitFpModifiers(Info& info, IR::Inst& inst) { + switch (inst.GetOpcode()) { + case IR::Opcode::FPAdd16: + case IR::Opcode::FPFma16: + case IR::Opcode::FPMul16: + case IR::Opcode::FPRoundEven16: + case IR::Opcode::FPFloor16: + case IR::Opcode::FPCeil16: + case IR::Opcode::FPTrunc16: { + const auto control{inst.Flags<IR::FpControl>()}; + switch (control.fmz_mode) { + case IR::FmzMode::DontCare: + break; + case IR::FmzMode::FTZ: + case IR::FmzMode::FMZ: + info.uses_fp16_denorms_flush = true; + break; + case IR::FmzMode::None: + info.uses_fp16_denorms_preserve = true; + break; + } + break; + } + case IR::Opcode::FPAdd32: + case IR::Opcode::FPFma32: + case IR::Opcode::FPMul32: + case IR::Opcode::FPRoundEven32: + case IR::Opcode::FPFloor32: + case IR::Opcode::FPCeil32: + case IR::Opcode::FPTrunc32: + case IR::Opcode::FPOrdEqual32: + case IR::Opcode::FPUnordEqual32: + case IR::Opcode::FPOrdNotEqual32: + case IR::Opcode::FPUnordNotEqual32: + case IR::Opcode::FPOrdLessThan32: + case IR::Opcode::FPUnordLessThan32: + case IR::Opcode::FPOrdGreaterThan32: + case IR::Opcode::FPUnordGreaterThan32: + case IR::Opcode::FPOrdLessThanEqual32: + case IR::Opcode::FPUnordLessThanEqual32: + case IR::Opcode::FPOrdGreaterThanEqual32: + case IR::Opcode::FPUnordGreaterThanEqual32: + case IR::Opcode::ConvertF16F32: + case IR::Opcode::ConvertF64F32: { + const auto control{inst.Flags<IR::FpControl>()}; + switch (control.fmz_mode) { + case IR::FmzMode::DontCare: + break; + case IR::FmzMode::FTZ: + case IR::FmzMode::FMZ: + info.uses_fp32_denorms_flush = true; + break; + case IR::FmzMode::None: + info.uses_fp32_denorms_preserve = true; + break; + } + break; + } + default: + break; + } +} + +void VisitCbufs(Info& info, IR::Inst& inst) { + switch (inst.GetOpcode()) { + case IR::Opcode::GetCbufU8: + case IR::Opcode::GetCbufS8: + case IR::Opcode::GetCbufU16: + case IR::Opcode::GetCbufS16: + case IR::Opcode::GetCbufU32: + case IR::Opcode::GetCbufF32: + case IR::Opcode::GetCbufU32x2: { + CheckCBufNVN(info, inst); + break; + } + default: + break; + } +} + +void Visit(Info& info, IR::Inst& inst) { + VisitUsages(info, inst); + VisitFpModifiers(info, inst); + VisitCbufs(info, inst); +} + +void GatherInfoFromHeader(Environment& env, Info& info) { + Stage stage{env.ShaderStage()}; + if (stage == Stage::Compute) { + return; + } + const auto& header{env.SPH()}; + if (stage == Stage::Fragment) { + if (!info.loads_indexed_attributes) { + return; + } + for (size_t index = 0; index < IR::NUM_GENERICS; ++index) { + const size_t offset{static_cast<size_t>(IR::Attribute::Generic0X) + index * 4}; + const auto vector{header.ps.imap_generic_vector[index]}; + info.loads.mask[offset + 0] = vector.x != PixelImap::Unused; + info.loads.mask[offset + 1] = vector.y != PixelImap::Unused; + info.loads.mask[offset + 2] = vector.z != PixelImap::Unused; + info.loads.mask[offset + 3] = vector.w != PixelImap::Unused; + } + return; + } + if (info.loads_indexed_attributes) { + for (size_t index = 0; index < IR::NUM_GENERICS; ++index) { + const IR::Attribute attribute{IR::Attribute::Generic0X + index * 4}; + const auto mask = header.vtg.InputGeneric(index); + for (size_t i = 0; i < 4; ++i) { + info.loads.Set(attribute + i, mask[i]); + } + } + for (size_t index = 0; index < 8; ++index) { + const u16 mask{header.vtg.clip_distances}; + info.loads.Set(IR::Attribute::ClipDistance0 + index, ((mask >> index) & 1) != 0); + } + info.loads.Set(IR::Attribute::PrimitiveId, header.vtg.imap_systemb.primitive_array_id != 0); + info.loads.Set(IR::Attribute::Layer, header.vtg.imap_systemb.rt_array_index != 0); + info.loads.Set(IR::Attribute::ViewportIndex, header.vtg.imap_systemb.viewport_index != 0); + info.loads.Set(IR::Attribute::PointSize, header.vtg.imap_systemb.point_size != 0); + info.loads.Set(IR::Attribute::PositionX, header.vtg.imap_systemb.position_x != 0); + info.loads.Set(IR::Attribute::PositionY, header.vtg.imap_systemb.position_y != 0); + info.loads.Set(IR::Attribute::PositionZ, header.vtg.imap_systemb.position_z != 0); + info.loads.Set(IR::Attribute::PositionW, header.vtg.imap_systemb.position_w != 0); + info.loads.Set(IR::Attribute::PointSpriteS, header.vtg.point_sprite_s != 0); + info.loads.Set(IR::Attribute::PointSpriteT, header.vtg.point_sprite_t != 0); + info.loads.Set(IR::Attribute::FogCoordinate, header.vtg.fog_coordinate != 0); + info.loads.Set(IR::Attribute::TessellationEvaluationPointU, + header.vtg.tessellation_eval_point_u != 0); + info.loads.Set(IR::Attribute::TessellationEvaluationPointV, + header.vtg.tessellation_eval_point_v != 0); + info.loads.Set(IR::Attribute::InstanceId, header.vtg.instance_id != 0); + info.loads.Set(IR::Attribute::VertexId, header.vtg.vertex_id != 0); + // TODO: Legacy varyings + } + if (info.stores_indexed_attributes) { + for (size_t index = 0; index < IR::NUM_GENERICS; ++index) { + const IR::Attribute attribute{IR::Attribute::Generic0X + index * 4}; + const auto mask{header.vtg.OutputGeneric(index)}; + for (size_t i = 0; i < 4; ++i) { + info.stores.Set(attribute + i, mask[i]); + } + } + for (size_t index = 0; index < 8; ++index) { + const u16 mask{header.vtg.omap_systemc.clip_distances}; + info.stores.Set(IR::Attribute::ClipDistance0 + index, ((mask >> index) & 1) != 0); + } + info.stores.Set(IR::Attribute::PrimitiveId, + header.vtg.omap_systemb.primitive_array_id != 0); + info.stores.Set(IR::Attribute::Layer, header.vtg.omap_systemb.rt_array_index != 0); + info.stores.Set(IR::Attribute::ViewportIndex, header.vtg.omap_systemb.viewport_index != 0); + info.stores.Set(IR::Attribute::PointSize, header.vtg.omap_systemb.point_size != 0); + info.stores.Set(IR::Attribute::PositionX, header.vtg.omap_systemb.position_x != 0); + info.stores.Set(IR::Attribute::PositionY, header.vtg.omap_systemb.position_y != 0); + info.stores.Set(IR::Attribute::PositionZ, header.vtg.omap_systemb.position_z != 0); + info.stores.Set(IR::Attribute::PositionW, header.vtg.omap_systemb.position_w != 0); + info.stores.Set(IR::Attribute::PointSpriteS, header.vtg.omap_systemc.point_sprite_s != 0); + info.stores.Set(IR::Attribute::PointSpriteT, header.vtg.omap_systemc.point_sprite_t != 0); + info.stores.Set(IR::Attribute::FogCoordinate, header.vtg.omap_systemc.fog_coordinate != 0); + info.stores.Set(IR::Attribute::TessellationEvaluationPointU, + header.vtg.omap_systemc.tessellation_eval_point_u != 0); + info.stores.Set(IR::Attribute::TessellationEvaluationPointV, + header.vtg.omap_systemc.tessellation_eval_point_v != 0); + info.stores.Set(IR::Attribute::InstanceId, header.vtg.omap_systemc.instance_id != 0); + info.stores.Set(IR::Attribute::VertexId, header.vtg.omap_systemc.vertex_id != 0); + // TODO: Legacy varyings + } +} +} // Anonymous namespace + +void CollectShaderInfoPass(Environment& env, IR::Program& program) { + Info& info{program.info}; + const u32 base{[&] { + switch (program.stage) { + case Stage::VertexA: + case Stage::VertexB: + return 0x110u; + case Stage::TessellationControl: + return 0x210u; + case Stage::TessellationEval: + return 0x310u; + case Stage::Geometry: + return 0x410u; + case Stage::Fragment: + return 0x510u; + case Stage::Compute: + return 0x310u; + } + throw InvalidArgument("Invalid stage {}", program.stage); + }()}; + info.nvn_buffer_base = base; + + for (IR::Block* const block : program.post_order_blocks) { + for (IR::Inst& inst : block->Instructions()) { + Visit(info, inst); + } + } + GatherInfoFromHeader(env, info); +} + +} // namespace Shader::Optimization diff --git a/src/shader_recompiler/ir_opt/constant_propagation_pass.cpp b/src/shader_recompiler/ir_opt/constant_propagation_pass.cpp new file mode 100644 index 000000000..8dd6d6c2c --- /dev/null +++ b/src/shader_recompiler/ir_opt/constant_propagation_pass.cpp @@ -0,0 +1,610 @@ +// Copyright 2021 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <algorithm> +#include <tuple> +#include <type_traits> + +#include "common/bit_cast.h" +#include "common/bit_util.h" +#include "shader_recompiler/exception.h" +#include "shader_recompiler/frontend/ir/ir_emitter.h" +#include "shader_recompiler/frontend/ir/value.h" +#include "shader_recompiler/ir_opt/passes.h" + +namespace Shader::Optimization { +namespace { +// Metaprogramming stuff to get arguments information out of a lambda +template <typename Func> +struct LambdaTraits : LambdaTraits<decltype(&std::remove_reference_t<Func>::operator())> {}; + +template <typename ReturnType, typename LambdaType, typename... Args> +struct LambdaTraits<ReturnType (LambdaType::*)(Args...) const> { + template <size_t I> + using ArgType = std::tuple_element_t<I, std::tuple<Args...>>; + + static constexpr size_t NUM_ARGS{sizeof...(Args)}; +}; + +template <typename T> +[[nodiscard]] T Arg(const IR::Value& value) { + if constexpr (std::is_same_v<T, bool>) { + return value.U1(); + } else if constexpr (std::is_same_v<T, u32>) { + return value.U32(); + } else if constexpr (std::is_same_v<T, s32>) { + return static_cast<s32>(value.U32()); + } else if constexpr (std::is_same_v<T, f32>) { + return value.F32(); + } else if constexpr (std::is_same_v<T, u64>) { + return value.U64(); + } +} + +template <typename T, typename ImmFn> +bool FoldCommutative(IR::Inst& inst, ImmFn&& imm_fn) { + const IR::Value lhs{inst.Arg(0)}; + const IR::Value rhs{inst.Arg(1)}; + + const bool is_lhs_immediate{lhs.IsImmediate()}; + const bool is_rhs_immediate{rhs.IsImmediate()}; + + if (is_lhs_immediate && is_rhs_immediate) { + const auto result{imm_fn(Arg<T>(lhs), Arg<T>(rhs))}; + inst.ReplaceUsesWith(IR::Value{result}); + return false; + } + if (is_lhs_immediate && !is_rhs_immediate) { + IR::Inst* const rhs_inst{rhs.InstRecursive()}; + if (rhs_inst->GetOpcode() == inst.GetOpcode() && rhs_inst->Arg(1).IsImmediate()) { + const auto combined{imm_fn(Arg<T>(lhs), Arg<T>(rhs_inst->Arg(1)))}; + inst.SetArg(0, rhs_inst->Arg(0)); + inst.SetArg(1, IR::Value{combined}); + } else { + // Normalize + inst.SetArg(0, rhs); + inst.SetArg(1, lhs); + } + } + if (!is_lhs_immediate && is_rhs_immediate) { + const IR::Inst* const lhs_inst{lhs.InstRecursive()}; + if (lhs_inst->GetOpcode() == inst.GetOpcode() && lhs_inst->Arg(1).IsImmediate()) { + const auto combined{imm_fn(Arg<T>(rhs), Arg<T>(lhs_inst->Arg(1)))}; + inst.SetArg(0, lhs_inst->Arg(0)); + inst.SetArg(1, IR::Value{combined}); + } + } + return true; +} + +template <typename Func> +bool FoldWhenAllImmediates(IR::Inst& inst, Func&& func) { + if (!inst.AreAllArgsImmediates() || inst.HasAssociatedPseudoOperation()) { + return false; + } + using Indices = std::make_index_sequence<LambdaTraits<decltype(func)>::NUM_ARGS>; + inst.ReplaceUsesWith(EvalImmediates(inst, func, Indices{})); + return true; +} + +void FoldGetRegister(IR::Inst& inst) { + if (inst.Arg(0).Reg() == IR::Reg::RZ) { + inst.ReplaceUsesWith(IR::Value{u32{0}}); + } +} + +void FoldGetPred(IR::Inst& inst) { + if (inst.Arg(0).Pred() == IR::Pred::PT) { + inst.ReplaceUsesWith(IR::Value{true}); + } +} + +/// Replaces the pattern generated by two XMAD multiplications +bool FoldXmadMultiply(IR::Block& block, IR::Inst& inst) { + /* + * We are looking for this pattern: + * %rhs_bfe = BitFieldUExtract %factor_a, #0, #16 + * %rhs_mul = IMul32 %rhs_bfe, %factor_b + * %lhs_bfe = BitFieldUExtract %factor_a, #16, #16 + * %rhs_mul = IMul32 %lhs_bfe, %factor_b + * %lhs_shl = ShiftLeftLogical32 %rhs_mul, #16 + * %result = IAdd32 %lhs_shl, %rhs_mul + * + * And replacing it with + * %result = IMul32 %factor_a, %factor_b + * + * This optimization has been proven safe by LLVM and MSVC. + */ + const IR::Value lhs_arg{inst.Arg(0)}; + const IR::Value rhs_arg{inst.Arg(1)}; + if (lhs_arg.IsImmediate() || rhs_arg.IsImmediate()) { + return false; + } + IR::Inst* const lhs_shl{lhs_arg.InstRecursive()}; + if (lhs_shl->GetOpcode() != IR::Opcode::ShiftLeftLogical32 || + lhs_shl->Arg(1) != IR::Value{16U}) { + return false; + } + if (lhs_shl->Arg(0).IsImmediate()) { + return false; + } + IR::Inst* const lhs_mul{lhs_shl->Arg(0).InstRecursive()}; + IR::Inst* const rhs_mul{rhs_arg.InstRecursive()}; + if (lhs_mul->GetOpcode() != IR::Opcode::IMul32 || rhs_mul->GetOpcode() != IR::Opcode::IMul32) { + return false; + } + if (lhs_mul->Arg(1).Resolve() != rhs_mul->Arg(1).Resolve()) { + return false; + } + const IR::U32 factor_b{lhs_mul->Arg(1)}; + if (lhs_mul->Arg(0).IsImmediate() || rhs_mul->Arg(0).IsImmediate()) { + return false; + } + IR::Inst* const lhs_bfe{lhs_mul->Arg(0).InstRecursive()}; + IR::Inst* const rhs_bfe{rhs_mul->Arg(0).InstRecursive()}; + if (lhs_bfe->GetOpcode() != IR::Opcode::BitFieldUExtract) { + return false; + } + if (rhs_bfe->GetOpcode() != IR::Opcode::BitFieldUExtract) { + return false; + } + if (lhs_bfe->Arg(1) != IR::Value{16U} || lhs_bfe->Arg(2) != IR::Value{16U}) { + return false; + } + if (rhs_bfe->Arg(1) != IR::Value{0U} || rhs_bfe->Arg(2) != IR::Value{16U}) { + return false; + } + if (lhs_bfe->Arg(0).Resolve() != rhs_bfe->Arg(0).Resolve()) { + return false; + } + const IR::U32 factor_a{lhs_bfe->Arg(0)}; + IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)}; + inst.ReplaceUsesWith(ir.IMul(factor_a, factor_b)); + return true; +} + +template <typename T> +void FoldAdd(IR::Block& block, IR::Inst& inst) { + if (inst.HasAssociatedPseudoOperation()) { + return; + } + if (!FoldCommutative<T>(inst, [](T a, T b) { return a + b; })) { + return; + } + const IR::Value rhs{inst.Arg(1)}; + if (rhs.IsImmediate() && Arg<T>(rhs) == 0) { + inst.ReplaceUsesWith(inst.Arg(0)); + return; + } + if constexpr (std::is_same_v<T, u32>) { + if (FoldXmadMultiply(block, inst)) { + return; + } + } +} + +void FoldISub32(IR::Inst& inst) { + if (FoldWhenAllImmediates(inst, [](u32 a, u32 b) { return a - b; })) { + return; + } + if (inst.Arg(0).IsImmediate() || inst.Arg(1).IsImmediate()) { + return; + } + // ISub32 is generally used to subtract two constant buffers, compare and replace this with + // zero if they equal. + const auto equal_cbuf{[](IR::Inst* a, IR::Inst* b) { + return a->GetOpcode() == IR::Opcode::GetCbufU32 && + b->GetOpcode() == IR::Opcode::GetCbufU32 && a->Arg(0) == b->Arg(0) && + a->Arg(1) == b->Arg(1); + }}; + IR::Inst* op_a{inst.Arg(0).InstRecursive()}; + IR::Inst* op_b{inst.Arg(1).InstRecursive()}; + if (equal_cbuf(op_a, op_b)) { + inst.ReplaceUsesWith(IR::Value{u32{0}}); + return; + } + // It's also possible a value is being added to a cbuf and then subtracted + if (op_b->GetOpcode() == IR::Opcode::IAdd32) { + // Canonicalize local variables to simplify the following logic + std::swap(op_a, op_b); + } + if (op_b->GetOpcode() != IR::Opcode::GetCbufU32) { + return; + } + IR::Inst* const inst_cbuf{op_b}; + if (op_a->GetOpcode() != IR::Opcode::IAdd32) { + return; + } + IR::Value add_op_a{op_a->Arg(0)}; + IR::Value add_op_b{op_a->Arg(1)}; + if (add_op_b.IsImmediate()) { + // Canonicalize + std::swap(add_op_a, add_op_b); + } + if (add_op_b.IsImmediate()) { + return; + } + IR::Inst* const add_cbuf{add_op_b.InstRecursive()}; + if (equal_cbuf(add_cbuf, inst_cbuf)) { + inst.ReplaceUsesWith(add_op_a); + } +} + +void FoldSelect(IR::Inst& inst) { + const IR::Value cond{inst.Arg(0)}; + if (cond.IsImmediate()) { + inst.ReplaceUsesWith(cond.U1() ? inst.Arg(1) : inst.Arg(2)); + } +} + +void FoldFPMul32(IR::Inst& inst) { + const auto control{inst.Flags<IR::FpControl>()}; + if (control.no_contraction) { + return; + } + // Fold interpolation operations + const IR::Value lhs_value{inst.Arg(0)}; + const IR::Value rhs_value{inst.Arg(1)}; + if (lhs_value.IsImmediate() || rhs_value.IsImmediate()) { + return; + } + IR::Inst* const lhs_op{lhs_value.InstRecursive()}; + IR::Inst* const rhs_op{rhs_value.InstRecursive()}; + if (lhs_op->GetOpcode() != IR::Opcode::FPMul32 || + rhs_op->GetOpcode() != IR::Opcode::FPRecip32) { + return; + } + const IR::Value recip_source{rhs_op->Arg(0)}; + const IR::Value lhs_mul_source{lhs_op->Arg(1).Resolve()}; + if (recip_source.IsImmediate() || lhs_mul_source.IsImmediate()) { + return; + } + IR::Inst* const attr_a{recip_source.InstRecursive()}; + IR::Inst* const attr_b{lhs_mul_source.InstRecursive()}; + if (attr_a->GetOpcode() != IR::Opcode::GetAttribute || + attr_b->GetOpcode() != IR::Opcode::GetAttribute) { + return; + } + if (attr_a->Arg(0).Attribute() == attr_b->Arg(0).Attribute()) { + inst.ReplaceUsesWith(lhs_op->Arg(0)); + } +} + +void FoldLogicalAnd(IR::Inst& inst) { + if (!FoldCommutative<bool>(inst, [](bool a, bool b) { return a && b; })) { + return; + } + const IR::Value rhs{inst.Arg(1)}; + if (rhs.IsImmediate()) { + if (rhs.U1()) { + inst.ReplaceUsesWith(inst.Arg(0)); + } else { + inst.ReplaceUsesWith(IR::Value{false}); + } + } +} + +void FoldLogicalOr(IR::Inst& inst) { + if (!FoldCommutative<bool>(inst, [](bool a, bool b) { return a || b; })) { + return; + } + const IR::Value rhs{inst.Arg(1)}; + if (rhs.IsImmediate()) { + if (rhs.U1()) { + inst.ReplaceUsesWith(IR::Value{true}); + } else { + inst.ReplaceUsesWith(inst.Arg(0)); + } + } +} + +void FoldLogicalNot(IR::Inst& inst) { + const IR::U1 value{inst.Arg(0)}; + if (value.IsImmediate()) { + inst.ReplaceUsesWith(IR::Value{!value.U1()}); + return; + } + IR::Inst* const arg{value.InstRecursive()}; + if (arg->GetOpcode() == IR::Opcode::LogicalNot) { + inst.ReplaceUsesWith(arg->Arg(0)); + } +} + +template <IR::Opcode op, typename Dest, typename Source> +void FoldBitCast(IR::Inst& inst, IR::Opcode reverse) { + const IR::Value value{inst.Arg(0)}; + if (value.IsImmediate()) { + inst.ReplaceUsesWith(IR::Value{Common::BitCast<Dest>(Arg<Source>(value))}); + return; + } + IR::Inst* const arg_inst{value.InstRecursive()}; + if (arg_inst->GetOpcode() == reverse) { + inst.ReplaceUsesWith(arg_inst->Arg(0)); + return; + } + if constexpr (op == IR::Opcode::BitCastF32U32) { + if (arg_inst->GetOpcode() == IR::Opcode::GetCbufU32) { + // Replace the bitcast with a typed constant buffer read + inst.ReplaceOpcode(IR::Opcode::GetCbufF32); + inst.SetArg(0, arg_inst->Arg(0)); + inst.SetArg(1, arg_inst->Arg(1)); + return; + } + } +} + +void FoldInverseFunc(IR::Inst& inst, IR::Opcode reverse) { + const IR::Value value{inst.Arg(0)}; + if (value.IsImmediate()) { + return; + } + IR::Inst* const arg_inst{value.InstRecursive()}; + if (arg_inst->GetOpcode() == reverse) { + inst.ReplaceUsesWith(arg_inst->Arg(0)); + return; + } +} + +template <typename Func, size_t... I> +IR::Value EvalImmediates(const IR::Inst& inst, Func&& func, std::index_sequence<I...>) { + using Traits = LambdaTraits<decltype(func)>; + return IR::Value{func(Arg<typename Traits::template ArgType<I>>(inst.Arg(I))...)}; +} + +std::optional<IR::Value> FoldCompositeExtractImpl(IR::Value inst_value, IR::Opcode insert, + IR::Opcode construct, u32 first_index) { + IR::Inst* const inst{inst_value.InstRecursive()}; + if (inst->GetOpcode() == construct) { + return inst->Arg(first_index); + } + if (inst->GetOpcode() != insert) { + return std::nullopt; + } + IR::Value value_index{inst->Arg(2)}; + if (!value_index.IsImmediate()) { + return std::nullopt; + } + const u32 second_index{value_index.U32()}; + if (first_index != second_index) { + IR::Value value_composite{inst->Arg(0)}; + if (value_composite.IsImmediate()) { + return std::nullopt; + } + return FoldCompositeExtractImpl(value_composite, insert, construct, first_index); + } + return inst->Arg(1); +} + +void FoldCompositeExtract(IR::Inst& inst, IR::Opcode construct, IR::Opcode insert) { + const IR::Value value_1{inst.Arg(0)}; + const IR::Value value_2{inst.Arg(1)}; + if (value_1.IsImmediate()) { + return; + } + if (!value_2.IsImmediate()) { + return; + } + const u32 first_index{value_2.U32()}; + const std::optional result{FoldCompositeExtractImpl(value_1, insert, construct, first_index)}; + if (!result) { + return; + } + inst.ReplaceUsesWith(*result); +} + +IR::Value GetThroughCast(IR::Value value, IR::Opcode expected_cast) { + if (value.IsImmediate()) { + return value; + } + IR::Inst* const inst{value.InstRecursive()}; + if (inst->GetOpcode() == expected_cast) { + return inst->Arg(0).Resolve(); + } + return value; +} + +void FoldFSwizzleAdd(IR::Block& block, IR::Inst& inst) { + const IR::Value swizzle{inst.Arg(2)}; + if (!swizzle.IsImmediate()) { + return; + } + const IR::Value value_1{GetThroughCast(inst.Arg(0).Resolve(), IR::Opcode::BitCastF32U32)}; + const IR::Value value_2{GetThroughCast(inst.Arg(1).Resolve(), IR::Opcode::BitCastF32U32)}; + if (value_1.IsImmediate()) { + return; + } + const u32 swizzle_value{swizzle.U32()}; + if (swizzle_value != 0x99 && swizzle_value != 0xA5) { + return; + } + IR::Inst* const inst2{value_1.InstRecursive()}; + if (inst2->GetOpcode() != IR::Opcode::ShuffleButterfly) { + return; + } + const IR::Value value_3{GetThroughCast(inst2->Arg(0).Resolve(), IR::Opcode::BitCastU32F32)}; + if (value_2 != value_3) { + return; + } + const IR::Value index{inst2->Arg(1)}; + const IR::Value clamp{inst2->Arg(2)}; + const IR::Value segmentation_mask{inst2->Arg(3)}; + if (!index.IsImmediate() || !clamp.IsImmediate() || !segmentation_mask.IsImmediate()) { + return; + } + if (clamp.U32() != 3 || segmentation_mask.U32() != 28) { + return; + } + if (swizzle_value == 0x99) { + // DPdxFine + if (index.U32() == 1) { + IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)}; + inst.ReplaceUsesWith(ir.DPdxFine(IR::F32{inst.Arg(1)})); + } + } else if (swizzle_value == 0xA5) { + // DPdyFine + if (index.U32() == 2) { + IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)}; + inst.ReplaceUsesWith(ir.DPdyFine(IR::F32{inst.Arg(1)})); + } + } +} + +void ConstantPropagation(IR::Block& block, IR::Inst& inst) { + switch (inst.GetOpcode()) { + case IR::Opcode::GetRegister: + return FoldGetRegister(inst); + case IR::Opcode::GetPred: + return FoldGetPred(inst); + case IR::Opcode::IAdd32: + return FoldAdd<u32>(block, inst); + case IR::Opcode::ISub32: + return FoldISub32(inst); + case IR::Opcode::IMul32: + FoldWhenAllImmediates(inst, [](u32 a, u32 b) { return a * b; }); + return; + case IR::Opcode::ShiftRightArithmetic32: + FoldWhenAllImmediates(inst, [](s32 a, s32 b) { return static_cast<u32>(a >> b); }); + return; + case IR::Opcode::BitCastF32U32: + return FoldBitCast<IR::Opcode::BitCastF32U32, f32, u32>(inst, IR::Opcode::BitCastU32F32); + case IR::Opcode::BitCastU32F32: + return FoldBitCast<IR::Opcode::BitCastU32F32, u32, f32>(inst, IR::Opcode::BitCastF32U32); + case IR::Opcode::IAdd64: + return FoldAdd<u64>(block, inst); + case IR::Opcode::PackHalf2x16: + return FoldInverseFunc(inst, IR::Opcode::UnpackHalf2x16); + case IR::Opcode::UnpackHalf2x16: + return FoldInverseFunc(inst, IR::Opcode::PackHalf2x16); + case IR::Opcode::SelectU1: + case IR::Opcode::SelectU8: + case IR::Opcode::SelectU16: + case IR::Opcode::SelectU32: + case IR::Opcode::SelectU64: + case IR::Opcode::SelectF16: + case IR::Opcode::SelectF32: + case IR::Opcode::SelectF64: + return FoldSelect(inst); + case IR::Opcode::FPMul32: + return FoldFPMul32(inst); + case IR::Opcode::LogicalAnd: + return FoldLogicalAnd(inst); + case IR::Opcode::LogicalOr: + return FoldLogicalOr(inst); + case IR::Opcode::LogicalNot: + return FoldLogicalNot(inst); + case IR::Opcode::SLessThan: + FoldWhenAllImmediates(inst, [](s32 a, s32 b) { return a < b; }); + return; + case IR::Opcode::ULessThan: + FoldWhenAllImmediates(inst, [](u32 a, u32 b) { return a < b; }); + return; + case IR::Opcode::SLessThanEqual: + FoldWhenAllImmediates(inst, [](s32 a, s32 b) { return a <= b; }); + return; + case IR::Opcode::ULessThanEqual: + FoldWhenAllImmediates(inst, [](u32 a, u32 b) { return a <= b; }); + return; + case IR::Opcode::SGreaterThan: + FoldWhenAllImmediates(inst, [](s32 a, s32 b) { return a > b; }); + return; + case IR::Opcode::UGreaterThan: + FoldWhenAllImmediates(inst, [](u32 a, u32 b) { return a > b; }); + return; + case IR::Opcode::SGreaterThanEqual: + FoldWhenAllImmediates(inst, [](s32 a, s32 b) { return a >= b; }); + return; + case IR::Opcode::UGreaterThanEqual: + FoldWhenAllImmediates(inst, [](u32 a, u32 b) { return a >= b; }); + return; + case IR::Opcode::IEqual: + FoldWhenAllImmediates(inst, [](u32 a, u32 b) { return a == b; }); + return; + case IR::Opcode::INotEqual: + FoldWhenAllImmediates(inst, [](u32 a, u32 b) { return a != b; }); + return; + case IR::Opcode::BitwiseAnd32: + FoldWhenAllImmediates(inst, [](u32 a, u32 b) { return a & b; }); + return; + case IR::Opcode::BitwiseOr32: + FoldWhenAllImmediates(inst, [](u32 a, u32 b) { return a | b; }); + return; + case IR::Opcode::BitwiseXor32: + FoldWhenAllImmediates(inst, [](u32 a, u32 b) { return a ^ b; }); + return; + case IR::Opcode::BitFieldUExtract: + FoldWhenAllImmediates(inst, [](u32 base, u32 shift, u32 count) { + if (static_cast<size_t>(shift) + static_cast<size_t>(count) > 32) { + throw LogicError("Undefined result in {}({}, {}, {})", IR::Opcode::BitFieldUExtract, + base, shift, count); + } + return (base >> shift) & ((1U << count) - 1); + }); + return; + case IR::Opcode::BitFieldSExtract: + FoldWhenAllImmediates(inst, [](s32 base, u32 shift, u32 count) { + const size_t back_shift{static_cast<size_t>(shift) + static_cast<size_t>(count)}; + const size_t left_shift{32 - back_shift}; + const size_t right_shift{static_cast<size_t>(32 - count)}; + if (back_shift > 32 || left_shift >= 32 || right_shift >= 32) { + throw LogicError("Undefined result in {}({}, {}, {})", IR::Opcode::BitFieldSExtract, + base, shift, count); + } + return static_cast<u32>((base << left_shift) >> right_shift); + }); + return; + case IR::Opcode::BitFieldInsert: + FoldWhenAllImmediates(inst, [](u32 base, u32 insert, u32 offset, u32 bits) { + if (bits >= 32 || offset >= 32) { + throw LogicError("Undefined result in {}({}, {}, {}, {})", + IR::Opcode::BitFieldInsert, base, insert, offset, bits); + } + return (base & ~(~(~0u << bits) << offset)) | (insert << offset); + }); + return; + case IR::Opcode::CompositeExtractU32x2: + return FoldCompositeExtract(inst, IR::Opcode::CompositeConstructU32x2, + IR::Opcode::CompositeInsertU32x2); + case IR::Opcode::CompositeExtractU32x3: + return FoldCompositeExtract(inst, IR::Opcode::CompositeConstructU32x3, + IR::Opcode::CompositeInsertU32x3); + case IR::Opcode::CompositeExtractU32x4: + return FoldCompositeExtract(inst, IR::Opcode::CompositeConstructU32x4, + IR::Opcode::CompositeInsertU32x4); + case IR::Opcode::CompositeExtractF32x2: + return FoldCompositeExtract(inst, IR::Opcode::CompositeConstructF32x2, + IR::Opcode::CompositeInsertF32x2); + case IR::Opcode::CompositeExtractF32x3: + return FoldCompositeExtract(inst, IR::Opcode::CompositeConstructF32x3, + IR::Opcode::CompositeInsertF32x3); + case IR::Opcode::CompositeExtractF32x4: + return FoldCompositeExtract(inst, IR::Opcode::CompositeConstructF32x4, + IR::Opcode::CompositeInsertF32x4); + case IR::Opcode::CompositeExtractF16x2: + return FoldCompositeExtract(inst, IR::Opcode::CompositeConstructF16x2, + IR::Opcode::CompositeInsertF16x2); + case IR::Opcode::CompositeExtractF16x3: + return FoldCompositeExtract(inst, IR::Opcode::CompositeConstructF16x3, + IR::Opcode::CompositeInsertF16x3); + case IR::Opcode::CompositeExtractF16x4: + return FoldCompositeExtract(inst, IR::Opcode::CompositeConstructF16x4, + IR::Opcode::CompositeInsertF16x4); + case IR::Opcode::FSwizzleAdd: + return FoldFSwizzleAdd(block, inst); + default: + break; + } +} +} // Anonymous namespace + +void ConstantPropagationPass(IR::Program& program) { + const auto end{program.post_order_blocks.rend()}; + for (auto it = program.post_order_blocks.rbegin(); it != end; ++it) { + IR::Block* const block{*it}; + for (IR::Inst& inst : block->Instructions()) { + ConstantPropagation(*block, inst); + } + } +} + +} // namespace Shader::Optimization diff --git a/src/shader_recompiler/ir_opt/dead_code_elimination_pass.cpp b/src/shader_recompiler/ir_opt/dead_code_elimination_pass.cpp new file mode 100644 index 000000000..400836301 --- /dev/null +++ b/src/shader_recompiler/ir_opt/dead_code_elimination_pass.cpp @@ -0,0 +1,26 @@ +// Copyright 2021 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include "shader_recompiler/frontend/ir/basic_block.h" +#include "shader_recompiler/frontend/ir/value.h" +#include "shader_recompiler/ir_opt/passes.h" + +namespace Shader::Optimization { + +void DeadCodeEliminationPass(IR::Program& program) { + // We iterate over the instructions in reverse order. + // This is because removing an instruction reduces the number of uses for earlier instructions. + for (IR::Block* const block : program.post_order_blocks) { + auto it{block->end()}; + while (it != block->begin()) { + --it; + if (!it->HasUses() && !it->MayHaveSideEffects()) { + it->Invalidate(); + it = block->Instructions().erase(it); + } + } + } +} + +} // namespace Shader::Optimization diff --git a/src/shader_recompiler/ir_opt/dual_vertex_pass.cpp b/src/shader_recompiler/ir_opt/dual_vertex_pass.cpp new file mode 100644 index 000000000..055ba9c54 --- /dev/null +++ b/src/shader_recompiler/ir_opt/dual_vertex_pass.cpp @@ -0,0 +1,30 @@ +// Copyright 2021 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include "shader_recompiler/frontend/ir/ir_emitter.h" +#include "shader_recompiler/ir_opt/passes.h" + +namespace Shader::Optimization { + +void VertexATransformPass(IR::Program& program) { + for (IR::Block* const block : program.blocks) { + for (IR::Inst& inst : block->Instructions()) { + if (inst.GetOpcode() == IR::Opcode::Epilogue) { + return inst.Invalidate(); + } + } + } +} + +void VertexBTransformPass(IR::Program& program) { + for (IR::Block* const block : program.blocks) { + for (IR::Inst& inst : block->Instructions()) { + if (inst.GetOpcode() == IR::Opcode::Prologue) { + return inst.Invalidate(); + } + } + } +} + +} // namespace Shader::Optimization diff --git a/src/shader_recompiler/ir_opt/global_memory_to_storage_buffer_pass.cpp b/src/shader_recompiler/ir_opt/global_memory_to_storage_buffer_pass.cpp new file mode 100644 index 000000000..4197b0095 --- /dev/null +++ b/src/shader_recompiler/ir_opt/global_memory_to_storage_buffer_pass.cpp @@ -0,0 +1,526 @@ +// Copyright 2021 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <algorithm> +#include <compare> +#include <optional> +#include <queue> + +#include <boost/container/flat_set.hpp> +#include <boost/container/small_vector.hpp> + +#include "common/alignment.h" +#include "shader_recompiler/frontend/ir/basic_block.h" +#include "shader_recompiler/frontend/ir/breadth_first_search.h" +#include "shader_recompiler/frontend/ir/ir_emitter.h" +#include "shader_recompiler/frontend/ir/value.h" +#include "shader_recompiler/ir_opt/passes.h" + +namespace Shader::Optimization { +namespace { +/// Address in constant buffers to the storage buffer descriptor +struct StorageBufferAddr { + auto operator<=>(const StorageBufferAddr&) const noexcept = default; + + u32 index; + u32 offset; +}; + +/// Block iterator to a global memory instruction and the storage buffer it uses +struct StorageInst { + StorageBufferAddr storage_buffer; + IR::Inst* inst; + IR::Block* block; +}; + +/// Bias towards a certain range of constant buffers when looking for storage buffers +struct Bias { + u32 index; + u32 offset_begin; + u32 offset_end; +}; + +using boost::container::flat_set; +using boost::container::small_vector; +using StorageBufferSet = + flat_set<StorageBufferAddr, std::less<StorageBufferAddr>, small_vector<StorageBufferAddr, 16>>; +using StorageInstVector = small_vector<StorageInst, 24>; +using StorageWritesSet = + flat_set<StorageBufferAddr, std::less<StorageBufferAddr>, small_vector<StorageBufferAddr, 16>>; + +struct StorageInfo { + StorageBufferSet set; + StorageInstVector to_replace; + StorageWritesSet writes; +}; + +/// Returns true when the instruction is a global memory instruction +bool IsGlobalMemory(const IR::Inst& inst) { + switch (inst.GetOpcode()) { + case IR::Opcode::LoadGlobalS8: + case IR::Opcode::LoadGlobalU8: + case IR::Opcode::LoadGlobalS16: + case IR::Opcode::LoadGlobalU16: + case IR::Opcode::LoadGlobal32: + case IR::Opcode::LoadGlobal64: + case IR::Opcode::LoadGlobal128: + case IR::Opcode::WriteGlobalS8: + case IR::Opcode::WriteGlobalU8: + case IR::Opcode::WriteGlobalS16: + case IR::Opcode::WriteGlobalU16: + case IR::Opcode::WriteGlobal32: + case IR::Opcode::WriteGlobal64: + case IR::Opcode::WriteGlobal128: + case IR::Opcode::GlobalAtomicIAdd32: + case IR::Opcode::GlobalAtomicSMin32: + case IR::Opcode::GlobalAtomicUMin32: + case IR::Opcode::GlobalAtomicSMax32: + case IR::Opcode::GlobalAtomicUMax32: + case IR::Opcode::GlobalAtomicInc32: + case IR::Opcode::GlobalAtomicDec32: + case IR::Opcode::GlobalAtomicAnd32: + case IR::Opcode::GlobalAtomicOr32: + case IR::Opcode::GlobalAtomicXor32: + case IR::Opcode::GlobalAtomicExchange32: + case IR::Opcode::GlobalAtomicIAdd64: + case IR::Opcode::GlobalAtomicSMin64: + case IR::Opcode::GlobalAtomicUMin64: + case IR::Opcode::GlobalAtomicSMax64: + case IR::Opcode::GlobalAtomicUMax64: + case IR::Opcode::GlobalAtomicAnd64: + case IR::Opcode::GlobalAtomicOr64: + case IR::Opcode::GlobalAtomicXor64: + case IR::Opcode::GlobalAtomicExchange64: + case IR::Opcode::GlobalAtomicAddF32: + case IR::Opcode::GlobalAtomicAddF16x2: + case IR::Opcode::GlobalAtomicAddF32x2: + case IR::Opcode::GlobalAtomicMinF16x2: + case IR::Opcode::GlobalAtomicMinF32x2: + case IR::Opcode::GlobalAtomicMaxF16x2: + case IR::Opcode::GlobalAtomicMaxF32x2: + return true; + default: + return false; + } +} + +/// Returns true when the instruction is a global memory instruction +bool IsGlobalMemoryWrite(const IR::Inst& inst) { + switch (inst.GetOpcode()) { + case IR::Opcode::WriteGlobalS8: + case IR::Opcode::WriteGlobalU8: + case IR::Opcode::WriteGlobalS16: + case IR::Opcode::WriteGlobalU16: + case IR::Opcode::WriteGlobal32: + case IR::Opcode::WriteGlobal64: + case IR::Opcode::WriteGlobal128: + case IR::Opcode::GlobalAtomicIAdd32: + case IR::Opcode::GlobalAtomicSMin32: + case IR::Opcode::GlobalAtomicUMin32: + case IR::Opcode::GlobalAtomicSMax32: + case IR::Opcode::GlobalAtomicUMax32: + case IR::Opcode::GlobalAtomicInc32: + case IR::Opcode::GlobalAtomicDec32: + case IR::Opcode::GlobalAtomicAnd32: + case IR::Opcode::GlobalAtomicOr32: + case IR::Opcode::GlobalAtomicXor32: + case IR::Opcode::GlobalAtomicExchange32: + case IR::Opcode::GlobalAtomicIAdd64: + case IR::Opcode::GlobalAtomicSMin64: + case IR::Opcode::GlobalAtomicUMin64: + case IR::Opcode::GlobalAtomicSMax64: + case IR::Opcode::GlobalAtomicUMax64: + case IR::Opcode::GlobalAtomicAnd64: + case IR::Opcode::GlobalAtomicOr64: + case IR::Opcode::GlobalAtomicXor64: + case IR::Opcode::GlobalAtomicExchange64: + case IR::Opcode::GlobalAtomicAddF32: + case IR::Opcode::GlobalAtomicAddF16x2: + case IR::Opcode::GlobalAtomicAddF32x2: + case IR::Opcode::GlobalAtomicMinF16x2: + case IR::Opcode::GlobalAtomicMinF32x2: + case IR::Opcode::GlobalAtomicMaxF16x2: + case IR::Opcode::GlobalAtomicMaxF32x2: + return true; + default: + return false; + } +} + +/// Converts a global memory opcode to its storage buffer equivalent +IR::Opcode GlobalToStorage(IR::Opcode opcode) { + switch (opcode) { + case IR::Opcode::LoadGlobalS8: + return IR::Opcode::LoadStorageS8; + case IR::Opcode::LoadGlobalU8: + return IR::Opcode::LoadStorageU8; + case IR::Opcode::LoadGlobalS16: + return IR::Opcode::LoadStorageS16; + case IR::Opcode::LoadGlobalU16: + return IR::Opcode::LoadStorageU16; + case IR::Opcode::LoadGlobal32: + return IR::Opcode::LoadStorage32; + case IR::Opcode::LoadGlobal64: + return IR::Opcode::LoadStorage64; + case IR::Opcode::LoadGlobal128: + return IR::Opcode::LoadStorage128; + case IR::Opcode::WriteGlobalS8: + return IR::Opcode::WriteStorageS8; + case IR::Opcode::WriteGlobalU8: + return IR::Opcode::WriteStorageU8; + case IR::Opcode::WriteGlobalS16: + return IR::Opcode::WriteStorageS16; + case IR::Opcode::WriteGlobalU16: + return IR::Opcode::WriteStorageU16; + case IR::Opcode::WriteGlobal32: + return IR::Opcode::WriteStorage32; + case IR::Opcode::WriteGlobal64: + return IR::Opcode::WriteStorage64; + case IR::Opcode::WriteGlobal128: + return IR::Opcode::WriteStorage128; + case IR::Opcode::GlobalAtomicIAdd32: + return IR::Opcode::StorageAtomicIAdd32; + case IR::Opcode::GlobalAtomicSMin32: + return IR::Opcode::StorageAtomicSMin32; + case IR::Opcode::GlobalAtomicUMin32: + return IR::Opcode::StorageAtomicUMin32; + case IR::Opcode::GlobalAtomicSMax32: + return IR::Opcode::StorageAtomicSMax32; + case IR::Opcode::GlobalAtomicUMax32: + return IR::Opcode::StorageAtomicUMax32; + case IR::Opcode::GlobalAtomicInc32: + return IR::Opcode::StorageAtomicInc32; + case IR::Opcode::GlobalAtomicDec32: + return IR::Opcode::StorageAtomicDec32; + case IR::Opcode::GlobalAtomicAnd32: + return IR::Opcode::StorageAtomicAnd32; + case IR::Opcode::GlobalAtomicOr32: + return IR::Opcode::StorageAtomicOr32; + case IR::Opcode::GlobalAtomicXor32: + return IR::Opcode::StorageAtomicXor32; + case IR::Opcode::GlobalAtomicIAdd64: + return IR::Opcode::StorageAtomicIAdd64; + case IR::Opcode::GlobalAtomicSMin64: + return IR::Opcode::StorageAtomicSMin64; + case IR::Opcode::GlobalAtomicUMin64: + return IR::Opcode::StorageAtomicUMin64; + case IR::Opcode::GlobalAtomicSMax64: + return IR::Opcode::StorageAtomicSMax64; + case IR::Opcode::GlobalAtomicUMax64: + return IR::Opcode::StorageAtomicUMax64; + case IR::Opcode::GlobalAtomicAnd64: + return IR::Opcode::StorageAtomicAnd64; + case IR::Opcode::GlobalAtomicOr64: + return IR::Opcode::StorageAtomicOr64; + case IR::Opcode::GlobalAtomicXor64: + return IR::Opcode::StorageAtomicXor64; + case IR::Opcode::GlobalAtomicExchange32: + return IR::Opcode::StorageAtomicExchange32; + case IR::Opcode::GlobalAtomicExchange64: + return IR::Opcode::StorageAtomicExchange64; + case IR::Opcode::GlobalAtomicAddF32: + return IR::Opcode::StorageAtomicAddF32; + case IR::Opcode::GlobalAtomicAddF16x2: + return IR::Opcode::StorageAtomicAddF16x2; + case IR::Opcode::GlobalAtomicMinF16x2: + return IR::Opcode::StorageAtomicMinF16x2; + case IR::Opcode::GlobalAtomicMaxF16x2: + return IR::Opcode::StorageAtomicMaxF16x2; + case IR::Opcode::GlobalAtomicAddF32x2: + return IR::Opcode::StorageAtomicAddF32x2; + case IR::Opcode::GlobalAtomicMinF32x2: + return IR::Opcode::StorageAtomicMinF32x2; + case IR::Opcode::GlobalAtomicMaxF32x2: + return IR::Opcode::StorageAtomicMaxF32x2; + default: + throw InvalidArgument("Invalid global memory opcode {}", opcode); + } +} + +/// Returns true when a storage buffer address satisfies a bias +bool MeetsBias(const StorageBufferAddr& storage_buffer, const Bias& bias) noexcept { + return storage_buffer.index == bias.index && storage_buffer.offset >= bias.offset_begin && + storage_buffer.offset < bias.offset_end; +} + +struct LowAddrInfo { + IR::U32 value; + s32 imm_offset; +}; + +/// Tries to track the first 32-bits of a global memory instruction +std::optional<LowAddrInfo> TrackLowAddress(IR::Inst* inst) { + // The first argument is the low level GPU pointer to the global memory instruction + const IR::Value addr{inst->Arg(0)}; + if (addr.IsImmediate()) { + // Not much we can do if it's an immediate + return std::nullopt; + } + // This address is expected to either be a PackUint2x32, a IAdd64, or a CompositeConstructU32x2 + IR::Inst* addr_inst{addr.InstRecursive()}; + s32 imm_offset{0}; + if (addr_inst->GetOpcode() == IR::Opcode::IAdd64) { + // If it's an IAdd64, get the immediate offset it is applying and grab the address + // instruction. This expects for the instruction to be canonicalized having the address on + // the first argument and the immediate offset on the second one. + const IR::U64 imm_offset_value{addr_inst->Arg(1)}; + if (!imm_offset_value.IsImmediate()) { + return std::nullopt; + } + imm_offset = static_cast<s32>(static_cast<s64>(imm_offset_value.U64())); + const IR::U64 iadd_addr{addr_inst->Arg(0)}; + if (iadd_addr.IsImmediate()) { + return std::nullopt; + } + addr_inst = iadd_addr.InstRecursive(); + } + // With IAdd64 handled, now PackUint2x32 is expected + if (addr_inst->GetOpcode() == IR::Opcode::PackUint2x32) { + // PackUint2x32 is expected to be generated from a vector + const IR::Value vector{addr_inst->Arg(0)}; + if (vector.IsImmediate()) { + return std::nullopt; + } + addr_inst = vector.InstRecursive(); + } + // The vector is expected to be a CompositeConstructU32x2 + if (addr_inst->GetOpcode() != IR::Opcode::CompositeConstructU32x2) { + return std::nullopt; + } + // Grab the first argument from the CompositeConstructU32x2, this is the low address. + return LowAddrInfo{ + .value{IR::U32{addr_inst->Arg(0)}}, + .imm_offset = imm_offset, + }; +} + +/// Tries to track the storage buffer address used by a global memory instruction +std::optional<StorageBufferAddr> Track(const IR::Value& value, const Bias* bias) { + const auto pred{[bias](const IR::Inst* inst) -> std::optional<StorageBufferAddr> { + if (inst->GetOpcode() != IR::Opcode::GetCbufU32) { + return std::nullopt; + } + const IR::Value index{inst->Arg(0)}; + const IR::Value offset{inst->Arg(1)}; + if (!index.IsImmediate()) { + // Definitely not a storage buffer if it's read from a + // non-immediate index + return std::nullopt; + } + if (!offset.IsImmediate()) { + // TODO: Support SSBO arrays + return std::nullopt; + } + const StorageBufferAddr storage_buffer{ + .index = index.U32(), + .offset = offset.U32(), + }; + if (!Common::IsAligned(storage_buffer.offset, 16)) { + // The SSBO pointer has to be aligned + return std::nullopt; + } + if (bias && !MeetsBias(storage_buffer, *bias)) { + // We have to blacklist some addresses in case we wrongly + // point to them + return std::nullopt; + } + return storage_buffer; + }}; + return BreadthFirstSearch(value, pred); +} + +/// Collects the storage buffer used by a global memory instruction and the instruction itself +void CollectStorageBuffers(IR::Block& block, IR::Inst& inst, StorageInfo& info) { + // NVN puts storage buffers in a specific range, we have to bias towards these addresses to + // avoid getting false positives + static constexpr Bias nvn_bias{ + .index = 0, + .offset_begin = 0x110, + .offset_end = 0x610, + }; + // Track the low address of the instruction + const std::optional<LowAddrInfo> low_addr_info{TrackLowAddress(&inst)}; + if (!low_addr_info) { + // Failed to track the low address, use NVN fallbacks + return; + } + // First try to find storage buffers in the NVN address + const IR::U32 low_addr{low_addr_info->value}; + std::optional<StorageBufferAddr> storage_buffer{Track(low_addr, &nvn_bias)}; + if (!storage_buffer) { + // If it fails, track without a bias + storage_buffer = Track(low_addr, nullptr); + if (!storage_buffer) { + // If that also fails, use NVN fallbacks + return; + } + } + // Collect storage buffer and the instruction + if (IsGlobalMemoryWrite(inst)) { + info.writes.insert(*storage_buffer); + } + info.set.insert(*storage_buffer); + info.to_replace.push_back(StorageInst{ + .storage_buffer{*storage_buffer}, + .inst = &inst, + .block = &block, + }); +} + +/// Returns the offset in indices (not bytes) for an equivalent storage instruction +IR::U32 StorageOffset(IR::Block& block, IR::Inst& inst, StorageBufferAddr buffer) { + IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)}; + IR::U32 offset; + if (const std::optional<LowAddrInfo> low_addr{TrackLowAddress(&inst)}) { + offset = low_addr->value; + if (low_addr->imm_offset != 0) { + offset = ir.IAdd(offset, ir.Imm32(low_addr->imm_offset)); + } + } else { + offset = ir.UConvert(32, IR::U64{inst.Arg(0)}); + } + // Subtract the least significant 32 bits from the guest offset. The result is the storage + // buffer offset in bytes. + const IR::U32 low_cbuf{ir.GetCbuf(ir.Imm32(buffer.index), ir.Imm32(buffer.offset))}; + return ir.ISub(offset, low_cbuf); +} + +/// Replace a global memory load instruction with its storage buffer equivalent +void ReplaceLoad(IR::Block& block, IR::Inst& inst, const IR::U32& storage_index, + const IR::U32& offset) { + const IR::Opcode new_opcode{GlobalToStorage(inst.GetOpcode())}; + const auto it{IR::Block::InstructionList::s_iterator_to(inst)}; + const IR::Value value{&*block.PrependNewInst(it, new_opcode, {storage_index, offset})}; + inst.ReplaceUsesWith(value); +} + +/// Replace a global memory write instruction with its storage buffer equivalent +void ReplaceWrite(IR::Block& block, IR::Inst& inst, const IR::U32& storage_index, + const IR::U32& offset) { + const IR::Opcode new_opcode{GlobalToStorage(inst.GetOpcode())}; + const auto it{IR::Block::InstructionList::s_iterator_to(inst)}; + block.PrependNewInst(it, new_opcode, {storage_index, offset, inst.Arg(1)}); + inst.Invalidate(); +} + +/// Replace an atomic operation on global memory instruction with its storage buffer equivalent +void ReplaceAtomic(IR::Block& block, IR::Inst& inst, const IR::U32& storage_index, + const IR::U32& offset) { + const IR::Opcode new_opcode{GlobalToStorage(inst.GetOpcode())}; + const auto it{IR::Block::InstructionList::s_iterator_to(inst)}; + const IR::Value value{ + &*block.PrependNewInst(it, new_opcode, {storage_index, offset, inst.Arg(1)})}; + inst.ReplaceUsesWith(value); +} + +/// Replace a global memory instruction with its storage buffer equivalent +void Replace(IR::Block& block, IR::Inst& inst, const IR::U32& storage_index, + const IR::U32& offset) { + switch (inst.GetOpcode()) { + case IR::Opcode::LoadGlobalS8: + case IR::Opcode::LoadGlobalU8: + case IR::Opcode::LoadGlobalS16: + case IR::Opcode::LoadGlobalU16: + case IR::Opcode::LoadGlobal32: + case IR::Opcode::LoadGlobal64: + case IR::Opcode::LoadGlobal128: + return ReplaceLoad(block, inst, storage_index, offset); + case IR::Opcode::WriteGlobalS8: + case IR::Opcode::WriteGlobalU8: + case IR::Opcode::WriteGlobalS16: + case IR::Opcode::WriteGlobalU16: + case IR::Opcode::WriteGlobal32: + case IR::Opcode::WriteGlobal64: + case IR::Opcode::WriteGlobal128: + return ReplaceWrite(block, inst, storage_index, offset); + case IR::Opcode::GlobalAtomicIAdd32: + case IR::Opcode::GlobalAtomicSMin32: + case IR::Opcode::GlobalAtomicUMin32: + case IR::Opcode::GlobalAtomicSMax32: + case IR::Opcode::GlobalAtomicUMax32: + case IR::Opcode::GlobalAtomicInc32: + case IR::Opcode::GlobalAtomicDec32: + case IR::Opcode::GlobalAtomicAnd32: + case IR::Opcode::GlobalAtomicOr32: + case IR::Opcode::GlobalAtomicXor32: + case IR::Opcode::GlobalAtomicExchange32: + case IR::Opcode::GlobalAtomicIAdd64: + case IR::Opcode::GlobalAtomicSMin64: + case IR::Opcode::GlobalAtomicUMin64: + case IR::Opcode::GlobalAtomicSMax64: + case IR::Opcode::GlobalAtomicUMax64: + case IR::Opcode::GlobalAtomicAnd64: + case IR::Opcode::GlobalAtomicOr64: + case IR::Opcode::GlobalAtomicXor64: + case IR::Opcode::GlobalAtomicExchange64: + case IR::Opcode::GlobalAtomicAddF32: + case IR::Opcode::GlobalAtomicAddF16x2: + case IR::Opcode::GlobalAtomicAddF32x2: + case IR::Opcode::GlobalAtomicMinF16x2: + case IR::Opcode::GlobalAtomicMinF32x2: + case IR::Opcode::GlobalAtomicMaxF16x2: + case IR::Opcode::GlobalAtomicMaxF32x2: + return ReplaceAtomic(block, inst, storage_index, offset); + default: + throw InvalidArgument("Invalid global memory opcode {}", inst.GetOpcode()); + } +} +} // Anonymous namespace + +void GlobalMemoryToStorageBufferPass(IR::Program& program) { + StorageInfo info; + for (IR::Block* const block : program.post_order_blocks) { + for (IR::Inst& inst : block->Instructions()) { + if (!IsGlobalMemory(inst)) { + continue; + } + CollectStorageBuffers(*block, inst, info); + } + } + for (const StorageBufferAddr& storage_buffer : info.set) { + program.info.storage_buffers_descriptors.push_back({ + .cbuf_index = storage_buffer.index, + .cbuf_offset = storage_buffer.offset, + .count = 1, + .is_written = info.writes.contains(storage_buffer), + }); + } + for (const StorageInst& storage_inst : info.to_replace) { + const StorageBufferAddr storage_buffer{storage_inst.storage_buffer}; + const auto it{info.set.find(storage_inst.storage_buffer)}; + const IR::U32 index{IR::Value{static_cast<u32>(info.set.index_of(it))}}; + IR::Block* const block{storage_inst.block}; + IR::Inst* const inst{storage_inst.inst}; + const IR::U32 offset{StorageOffset(*block, *inst, storage_buffer)}; + Replace(*block, *inst, index, offset); + } +} + +template <typename Descriptors, typename Descriptor, typename Func> +static u32 Add(Descriptors& descriptors, const Descriptor& desc, Func&& pred) { + // TODO: Handle arrays + const auto it{std::ranges::find_if(descriptors, pred)}; + if (it != descriptors.end()) { + return static_cast<u32>(std::distance(descriptors.begin(), it)); + } + descriptors.push_back(desc); + return static_cast<u32>(descriptors.size()) - 1; +} + +void JoinStorageInfo(Info& base, Info& source) { + auto& descriptors = base.storage_buffers_descriptors; + for (auto& desc : source.storage_buffers_descriptors) { + auto it{std::ranges::find_if(descriptors, [&desc](const auto& existing) { + return desc.cbuf_index == existing.cbuf_index && + desc.cbuf_offset == existing.cbuf_offset && desc.count == existing.count; + })}; + if (it != descriptors.end()) { + it->is_written |= desc.is_written; + continue; + } + descriptors.push_back(desc); + } +} + +} // namespace Shader::Optimization diff --git a/src/shader_recompiler/ir_opt/identity_removal_pass.cpp b/src/shader_recompiler/ir_opt/identity_removal_pass.cpp new file mode 100644 index 000000000..e9b55f835 --- /dev/null +++ b/src/shader_recompiler/ir_opt/identity_removal_pass.cpp @@ -0,0 +1,38 @@ +// Copyright 2021 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <vector> + +#include "shader_recompiler/frontend/ir/basic_block.h" +#include "shader_recompiler/frontend/ir/value.h" +#include "shader_recompiler/ir_opt/passes.h" + +namespace Shader::Optimization { + +void IdentityRemovalPass(IR::Program& program) { + std::vector<IR::Inst*> to_invalidate; + for (IR::Block* const block : program.blocks) { + for (auto inst = block->begin(); inst != block->end();) { + const size_t num_args{inst->NumArgs()}; + for (size_t i = 0; i < num_args; ++i) { + IR::Value arg; + while ((arg = inst->Arg(i)).IsIdentity()) { + inst->SetArg(i, arg.Inst()->Arg(0)); + } + } + if (inst->GetOpcode() == IR::Opcode::Identity || + inst->GetOpcode() == IR::Opcode::Void) { + to_invalidate.push_back(&*inst); + inst = block->Instructions().erase(inst); + } else { + ++inst; + } + } + } + for (IR::Inst* const inst : to_invalidate) { + inst->Invalidate(); + } +} + +} // namespace Shader::Optimization diff --git a/src/shader_recompiler/ir_opt/lower_fp16_to_fp32.cpp b/src/shader_recompiler/ir_opt/lower_fp16_to_fp32.cpp new file mode 100644 index 000000000..773e1f961 --- /dev/null +++ b/src/shader_recompiler/ir_opt/lower_fp16_to_fp32.cpp @@ -0,0 +1,143 @@ +// Copyright 2021 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <algorithm> + +#include "shader_recompiler/frontend/ir/ir_emitter.h" +#include "shader_recompiler/frontend/ir/value.h" +#include "shader_recompiler/ir_opt/passes.h" + +namespace Shader::Optimization { +namespace { +IR::Opcode Replace(IR::Opcode op) { + switch (op) { + case IR::Opcode::FPAbs16: + return IR::Opcode::FPAbs32; + case IR::Opcode::FPAdd16: + return IR::Opcode::FPAdd32; + case IR::Opcode::FPCeil16: + return IR::Opcode::FPCeil32; + case IR::Opcode::FPFloor16: + return IR::Opcode::FPFloor32; + case IR::Opcode::FPFma16: + return IR::Opcode::FPFma32; + case IR::Opcode::FPMul16: + return IR::Opcode::FPMul32; + case IR::Opcode::FPNeg16: + return IR::Opcode::FPNeg32; + case IR::Opcode::FPRoundEven16: + return IR::Opcode::FPRoundEven32; + case IR::Opcode::FPSaturate16: + return IR::Opcode::FPSaturate32; + case IR::Opcode::FPClamp16: + return IR::Opcode::FPClamp32; + case IR::Opcode::FPTrunc16: + return IR::Opcode::FPTrunc32; + case IR::Opcode::CompositeConstructF16x2: + return IR::Opcode::CompositeConstructF32x2; + case IR::Opcode::CompositeConstructF16x3: + return IR::Opcode::CompositeConstructF32x3; + case IR::Opcode::CompositeConstructF16x4: + return IR::Opcode::CompositeConstructF32x4; + case IR::Opcode::CompositeExtractF16x2: + return IR::Opcode::CompositeExtractF32x2; + case IR::Opcode::CompositeExtractF16x3: + return IR::Opcode::CompositeExtractF32x3; + case IR::Opcode::CompositeExtractF16x4: + return IR::Opcode::CompositeExtractF32x4; + case IR::Opcode::CompositeInsertF16x2: + return IR::Opcode::CompositeInsertF32x2; + case IR::Opcode::CompositeInsertF16x3: + return IR::Opcode::CompositeInsertF32x3; + case IR::Opcode::CompositeInsertF16x4: + return IR::Opcode::CompositeInsertF32x4; + case IR::Opcode::FPOrdEqual16: + return IR::Opcode::FPOrdEqual32; + case IR::Opcode::FPUnordEqual16: + return IR::Opcode::FPUnordEqual32; + case IR::Opcode::FPOrdNotEqual16: + return IR::Opcode::FPOrdNotEqual32; + case IR::Opcode::FPUnordNotEqual16: + return IR::Opcode::FPUnordNotEqual32; + case IR::Opcode::FPOrdLessThan16: + return IR::Opcode::FPOrdLessThan32; + case IR::Opcode::FPUnordLessThan16: + return IR::Opcode::FPUnordLessThan32; + case IR::Opcode::FPOrdGreaterThan16: + return IR::Opcode::FPOrdGreaterThan32; + case IR::Opcode::FPUnordGreaterThan16: + return IR::Opcode::FPUnordGreaterThan32; + case IR::Opcode::FPOrdLessThanEqual16: + return IR::Opcode::FPOrdLessThanEqual32; + case IR::Opcode::FPUnordLessThanEqual16: + return IR::Opcode::FPUnordLessThanEqual32; + case IR::Opcode::FPOrdGreaterThanEqual16: + return IR::Opcode::FPOrdGreaterThanEqual32; + case IR::Opcode::FPUnordGreaterThanEqual16: + return IR::Opcode::FPUnordGreaterThanEqual32; + case IR::Opcode::FPIsNan16: + return IR::Opcode::FPIsNan32; + case IR::Opcode::ConvertS16F16: + return IR::Opcode::ConvertS16F32; + case IR::Opcode::ConvertS32F16: + return IR::Opcode::ConvertS32F32; + case IR::Opcode::ConvertS64F16: + return IR::Opcode::ConvertS64F32; + case IR::Opcode::ConvertU16F16: + return IR::Opcode::ConvertU16F32; + case IR::Opcode::ConvertU32F16: + return IR::Opcode::ConvertU32F32; + case IR::Opcode::ConvertU64F16: + return IR::Opcode::ConvertU64F32; + case IR::Opcode::PackFloat2x16: + return IR::Opcode::PackHalf2x16; + case IR::Opcode::UnpackFloat2x16: + return IR::Opcode::UnpackHalf2x16; + case IR::Opcode::ConvertF32F16: + return IR::Opcode::Identity; + case IR::Opcode::ConvertF16F32: + return IR::Opcode::Identity; + case IR::Opcode::ConvertF16S8: + return IR::Opcode::ConvertF32S8; + case IR::Opcode::ConvertF16S16: + return IR::Opcode::ConvertF32S16; + case IR::Opcode::ConvertF16S32: + return IR::Opcode::ConvertF32S32; + case IR::Opcode::ConvertF16S64: + return IR::Opcode::ConvertF32S64; + case IR::Opcode::ConvertF16U8: + return IR::Opcode::ConvertF32U8; + case IR::Opcode::ConvertF16U16: + return IR::Opcode::ConvertF32U16; + case IR::Opcode::ConvertF16U32: + return IR::Opcode::ConvertF32U32; + case IR::Opcode::ConvertF16U64: + return IR::Opcode::ConvertF32U64; + case IR::Opcode::GlobalAtomicAddF16x2: + return IR::Opcode::GlobalAtomicAddF32x2; + case IR::Opcode::StorageAtomicAddF16x2: + return IR::Opcode::StorageAtomicAddF32x2; + case IR::Opcode::GlobalAtomicMinF16x2: + return IR::Opcode::GlobalAtomicMinF32x2; + case IR::Opcode::StorageAtomicMinF16x2: + return IR::Opcode::StorageAtomicMinF32x2; + case IR::Opcode::GlobalAtomicMaxF16x2: + return IR::Opcode::GlobalAtomicMaxF32x2; + case IR::Opcode::StorageAtomicMaxF16x2: + return IR::Opcode::StorageAtomicMaxF32x2; + default: + return op; + } +} +} // Anonymous namespace + +void LowerFp16ToFp32(IR::Program& program) { + for (IR::Block* const block : program.blocks) { + for (IR::Inst& inst : block->Instructions()) { + inst.ReplaceOpcode(Replace(inst.GetOpcode())); + } + } +} + +} // namespace Shader::Optimization diff --git a/src/shader_recompiler/ir_opt/lower_int64_to_int32.cpp b/src/shader_recompiler/ir_opt/lower_int64_to_int32.cpp new file mode 100644 index 000000000..e80d3d1d9 --- /dev/null +++ b/src/shader_recompiler/ir_opt/lower_int64_to_int32.cpp @@ -0,0 +1,218 @@ +// Copyright 2021 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <utility> + +#include "shader_recompiler/exception.h" +#include "shader_recompiler/frontend/ir/basic_block.h" +#include "shader_recompiler/frontend/ir/ir_emitter.h" +#include "shader_recompiler/frontend/ir/program.h" +#include "shader_recompiler/frontend/ir/value.h" +#include "shader_recompiler/ir_opt/passes.h" + +namespace Shader::Optimization { +namespace { +std::pair<IR::U32, IR::U32> Unpack(IR::IREmitter& ir, const IR::Value& packed) { + if (packed.IsImmediate()) { + const u64 value{packed.U64()}; + return { + ir.Imm32(static_cast<u32>(value)), + ir.Imm32(static_cast<u32>(value >> 32)), + }; + } else { + return std::pair<IR::U32, IR::U32>{ + ir.CompositeExtract(packed, 0u), + ir.CompositeExtract(packed, 1u), + }; + } +} + +void IAdd64To32(IR::Block& block, IR::Inst& inst) { + if (inst.HasAssociatedPseudoOperation()) { + throw NotImplementedException("IAdd64 emulation with pseudo instructions"); + } + IR::IREmitter ir(block, IR::Block::InstructionList::s_iterator_to(inst)); + const auto [a_lo, a_hi]{Unpack(ir, inst.Arg(0))}; + const auto [b_lo, b_hi]{Unpack(ir, inst.Arg(1))}; + + const IR::U32 ret_lo{ir.IAdd(a_lo, b_lo)}; + const IR::U32 carry{ir.Select(ir.GetCarryFromOp(ret_lo), ir.Imm32(1u), ir.Imm32(0u))}; + + const IR::U32 ret_hi{ir.IAdd(ir.IAdd(a_hi, b_hi), carry)}; + inst.ReplaceUsesWith(ir.CompositeConstruct(ret_lo, ret_hi)); +} + +void ISub64To32(IR::Block& block, IR::Inst& inst) { + if (inst.HasAssociatedPseudoOperation()) { + throw NotImplementedException("ISub64 emulation with pseudo instructions"); + } + IR::IREmitter ir(block, IR::Block::InstructionList::s_iterator_to(inst)); + const auto [a_lo, a_hi]{Unpack(ir, inst.Arg(0))}; + const auto [b_lo, b_hi]{Unpack(ir, inst.Arg(1))}; + + const IR::U32 ret_lo{ir.ISub(a_lo, b_lo)}; + const IR::U1 underflow{ir.IGreaterThan(ret_lo, a_lo, false)}; + const IR::U32 underflow_bit{ir.Select(underflow, ir.Imm32(1u), ir.Imm32(0u))}; + + const IR::U32 ret_hi{ir.ISub(ir.ISub(a_hi, b_hi), underflow_bit)}; + inst.ReplaceUsesWith(ir.CompositeConstruct(ret_lo, ret_hi)); +} + +void INeg64To32(IR::Block& block, IR::Inst& inst) { + if (inst.HasAssociatedPseudoOperation()) { + throw NotImplementedException("INeg64 emulation with pseudo instructions"); + } + IR::IREmitter ir(block, IR::Block::InstructionList::s_iterator_to(inst)); + auto [lo, hi]{Unpack(ir, inst.Arg(0))}; + lo = ir.BitwiseNot(lo); + hi = ir.BitwiseNot(hi); + + lo = ir.IAdd(lo, ir.Imm32(1)); + + const IR::U32 carry{ir.Select(ir.GetCarryFromOp(lo), ir.Imm32(1u), ir.Imm32(0u))}; + hi = ir.IAdd(hi, carry); + + inst.ReplaceUsesWith(ir.CompositeConstruct(lo, hi)); +} + +void ShiftLeftLogical64To32(IR::Block& block, IR::Inst& inst) { + if (inst.HasAssociatedPseudoOperation()) { + throw NotImplementedException("ShiftLeftLogical64 emulation with pseudo instructions"); + } + IR::IREmitter ir(block, IR::Block::InstructionList::s_iterator_to(inst)); + const auto [lo, hi]{Unpack(ir, inst.Arg(0))}; + const IR::U32 shift{inst.Arg(1)}; + + const IR::U32 shifted_lo{ir.ShiftLeftLogical(lo, shift)}; + const IR::U32 shifted_hi{ir.ShiftLeftLogical(hi, shift)}; + + const IR::U32 inv_shift{ir.ISub(shift, ir.Imm32(32))}; + const IR::U1 is_long{ir.IGreaterThanEqual(inv_shift, ir.Imm32(0), true)}; + const IR::U1 is_zero{ir.IEqual(shift, ir.Imm32(0))}; + + const IR::U32 long_ret_lo{ir.Imm32(0)}; + const IR::U32 long_ret_hi{ir.ShiftLeftLogical(lo, inv_shift)}; + + const IR::U32 shift_complement{ir.ISub(ir.Imm32(32), shift)}; + const IR::U32 lo_extract{ir.BitFieldExtract(lo, shift_complement, shift, false)}; + const IR::U32 short_ret_lo{shifted_lo}; + const IR::U32 short_ret_hi{ir.BitwiseOr(shifted_hi, lo_extract)}; + + const IR::U32 zero_ret_lo{lo}; + const IR::U32 zero_ret_hi{hi}; + + const IR::U32 non_zero_lo{ir.Select(is_long, long_ret_lo, short_ret_lo)}; + const IR::U32 non_zero_hi{ir.Select(is_long, long_ret_hi, short_ret_hi)}; + + const IR::U32 ret_lo{ir.Select(is_zero, zero_ret_lo, non_zero_lo)}; + const IR::U32 ret_hi{ir.Select(is_zero, zero_ret_hi, non_zero_hi)}; + inst.ReplaceUsesWith(ir.CompositeConstruct(ret_lo, ret_hi)); +} + +void ShiftRightLogical64To32(IR::Block& block, IR::Inst& inst) { + if (inst.HasAssociatedPseudoOperation()) { + throw NotImplementedException("ShiftRightLogical64 emulation with pseudo instructions"); + } + IR::IREmitter ir(block, IR::Block::InstructionList::s_iterator_to(inst)); + const auto [lo, hi]{Unpack(ir, inst.Arg(0))}; + const IR::U32 shift{inst.Arg(1)}; + + const IR::U32 shifted_lo{ir.ShiftRightLogical(lo, shift)}; + const IR::U32 shifted_hi{ir.ShiftRightLogical(hi, shift)}; + + const IR::U32 inv_shift{ir.ISub(shift, ir.Imm32(32))}; + const IR::U1 is_long{ir.IGreaterThanEqual(inv_shift, ir.Imm32(0), true)}; + const IR::U1 is_zero{ir.IEqual(shift, ir.Imm32(0))}; + + const IR::U32 long_ret_hi{ir.Imm32(0)}; + const IR::U32 long_ret_lo{ir.ShiftRightLogical(hi, inv_shift)}; + + const IR::U32 shift_complement{ir.ISub(ir.Imm32(32), shift)}; + const IR::U32 short_hi_extract{ir.BitFieldExtract(hi, ir.Imm32(0), shift)}; + const IR::U32 short_ret_hi{shifted_hi}; + const IR::U32 short_ret_lo{ + ir.BitFieldInsert(shifted_lo, short_hi_extract, shift_complement, shift)}; + + const IR::U32 zero_ret_lo{lo}; + const IR::U32 zero_ret_hi{hi}; + + const IR::U32 non_zero_lo{ir.Select(is_long, long_ret_lo, short_ret_lo)}; + const IR::U32 non_zero_hi{ir.Select(is_long, long_ret_hi, short_ret_hi)}; + + const IR::U32 ret_lo{ir.Select(is_zero, zero_ret_lo, non_zero_lo)}; + const IR::U32 ret_hi{ir.Select(is_zero, zero_ret_hi, non_zero_hi)}; + inst.ReplaceUsesWith(ir.CompositeConstruct(ret_lo, ret_hi)); +} + +void ShiftRightArithmetic64To32(IR::Block& block, IR::Inst& inst) { + if (inst.HasAssociatedPseudoOperation()) { + throw NotImplementedException("ShiftRightArithmetic64 emulation with pseudo instructions"); + } + IR::IREmitter ir(block, IR::Block::InstructionList::s_iterator_to(inst)); + const auto [lo, hi]{Unpack(ir, inst.Arg(0))}; + const IR::U32 shift{inst.Arg(1)}; + + const IR::U32 shifted_lo{ir.ShiftRightLogical(lo, shift)}; + const IR::U32 shifted_hi{ir.ShiftRightArithmetic(hi, shift)}; + + const IR::U32 sign_extension{ir.ShiftRightArithmetic(hi, ir.Imm32(31))}; + + const IR::U32 inv_shift{ir.ISub(shift, ir.Imm32(32))}; + const IR::U1 is_long{ir.IGreaterThanEqual(inv_shift, ir.Imm32(0), true)}; + const IR::U1 is_zero{ir.IEqual(shift, ir.Imm32(0))}; + + const IR::U32 long_ret_hi{sign_extension}; + const IR::U32 long_ret_lo{ir.ShiftRightArithmetic(hi, inv_shift)}; + + const IR::U32 shift_complement{ir.ISub(ir.Imm32(32), shift)}; + const IR::U32 short_hi_extract(ir.BitFieldExtract(hi, ir.Imm32(0), shift)); + const IR::U32 short_ret_hi{shifted_hi}; + const IR::U32 short_ret_lo{ + ir.BitFieldInsert(shifted_lo, short_hi_extract, shift_complement, shift)}; + + const IR::U32 zero_ret_lo{lo}; + const IR::U32 zero_ret_hi{hi}; + + const IR::U32 non_zero_lo{ir.Select(is_long, long_ret_lo, short_ret_lo)}; + const IR::U32 non_zero_hi{ir.Select(is_long, long_ret_hi, short_ret_hi)}; + + const IR::U32 ret_lo{ir.Select(is_zero, zero_ret_lo, non_zero_lo)}; + const IR::U32 ret_hi{ir.Select(is_zero, zero_ret_hi, non_zero_hi)}; + inst.ReplaceUsesWith(ir.CompositeConstruct(ret_lo, ret_hi)); +} + +void Lower(IR::Block& block, IR::Inst& inst) { + switch (inst.GetOpcode()) { + case IR::Opcode::PackUint2x32: + case IR::Opcode::UnpackUint2x32: + return inst.ReplaceOpcode(IR::Opcode::Identity); + case IR::Opcode::IAdd64: + return IAdd64To32(block, inst); + case IR::Opcode::ISub64: + return ISub64To32(block, inst); + case IR::Opcode::INeg64: + return INeg64To32(block, inst); + case IR::Opcode::ShiftLeftLogical64: + return ShiftLeftLogical64To32(block, inst); + case IR::Opcode::ShiftRightLogical64: + return ShiftRightLogical64To32(block, inst); + case IR::Opcode::ShiftRightArithmetic64: + return ShiftRightArithmetic64To32(block, inst); + default: + break; + } +} +} // Anonymous namespace + +void LowerInt64ToInt32(IR::Program& program) { + const auto end{program.post_order_blocks.rend()}; + for (auto it = program.post_order_blocks.rbegin(); it != end; ++it) { + IR::Block* const block{*it}; + for (IR::Inst& inst : block->Instructions()) { + Lower(*block, inst); + } + } +} + +} // namespace Shader::Optimization diff --git a/src/shader_recompiler/ir_opt/passes.h b/src/shader_recompiler/ir_opt/passes.h new file mode 100644 index 000000000..2f89b1ea0 --- /dev/null +++ b/src/shader_recompiler/ir_opt/passes.h @@ -0,0 +1,32 @@ +// Copyright 2021 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <span> + +#include "shader_recompiler/environment.h" +#include "shader_recompiler/frontend/ir/basic_block.h" +#include "shader_recompiler/frontend/ir/program.h" + +namespace Shader::Optimization { + +void CollectShaderInfoPass(Environment& env, IR::Program& program); +void ConstantPropagationPass(IR::Program& program); +void DeadCodeEliminationPass(IR::Program& program); +void GlobalMemoryToStorageBufferPass(IR::Program& program); +void IdentityRemovalPass(IR::Program& program); +void LowerFp16ToFp32(IR::Program& program); +void LowerInt64ToInt32(IR::Program& program); +void SsaRewritePass(IR::Program& program); +void TexturePass(Environment& env, IR::Program& program); +void VerificationPass(const IR::Program& program); + +// Dual Vertex +void VertexATransformPass(IR::Program& program); +void VertexBTransformPass(IR::Program& program); +void JoinTextureInfo(Info& base, Info& source); +void JoinStorageInfo(Info& base, Info& source); + +} // namespace Shader::Optimization diff --git a/src/shader_recompiler/ir_opt/ssa_rewrite_pass.cpp b/src/shader_recompiler/ir_opt/ssa_rewrite_pass.cpp new file mode 100644 index 000000000..53145fb5e --- /dev/null +++ b/src/shader_recompiler/ir_opt/ssa_rewrite_pass.cpp @@ -0,0 +1,383 @@ +// Copyright 2021 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +// This file implements the SSA rewriting algorithm proposed in +// +// Simple and Efficient Construction of Static Single Assignment Form. +// Braun M., Buchwald S., Hack S., Leiba R., Mallon C., Zwinkau A. (2013) +// In: Jhala R., De Bosschere K. (eds) +// Compiler Construction. CC 2013. +// Lecture Notes in Computer Science, vol 7791. +// Springer, Berlin, Heidelberg +// +// https://link.springer.com/chapter/10.1007/978-3-642-37051-9_6 +// + +#include <span> +#include <variant> +#include <vector> + +#include <boost/container/flat_map.hpp> +#include <boost/container/flat_set.hpp> + +#include "shader_recompiler/frontend/ir/basic_block.h" +#include "shader_recompiler/frontend/ir/opcodes.h" +#include "shader_recompiler/frontend/ir/pred.h" +#include "shader_recompiler/frontend/ir/reg.h" +#include "shader_recompiler/frontend/ir/value.h" +#include "shader_recompiler/ir_opt/passes.h" + +namespace Shader::Optimization { +namespace { +struct FlagTag { + auto operator<=>(const FlagTag&) const noexcept = default; +}; +struct ZeroFlagTag : FlagTag {}; +struct SignFlagTag : FlagTag {}; +struct CarryFlagTag : FlagTag {}; +struct OverflowFlagTag : FlagTag {}; + +struct GotoVariable : FlagTag { + GotoVariable() = default; + explicit GotoVariable(u32 index_) : index{index_} {} + + auto operator<=>(const GotoVariable&) const noexcept = default; + + u32 index; +}; + +struct IndirectBranchVariable { + auto operator<=>(const IndirectBranchVariable&) const noexcept = default; +}; + +using Variant = std::variant<IR::Reg, IR::Pred, ZeroFlagTag, SignFlagTag, CarryFlagTag, + OverflowFlagTag, GotoVariable, IndirectBranchVariable>; +using ValueMap = boost::container::flat_map<IR::Block*, IR::Value>; + +struct DefTable { + const IR::Value& Def(IR::Block* block, IR::Reg variable) { + return block->SsaRegValue(variable); + } + void SetDef(IR::Block* block, IR::Reg variable, const IR::Value& value) { + block->SetSsaRegValue(variable, value); + } + + const IR::Value& Def(IR::Block* block, IR::Pred variable) { + return preds[IR::PredIndex(variable)][block]; + } + void SetDef(IR::Block* block, IR::Pred variable, const IR::Value& value) { + preds[IR::PredIndex(variable)].insert_or_assign(block, value); + } + + const IR::Value& Def(IR::Block* block, GotoVariable variable) { + return goto_vars[variable.index][block]; + } + void SetDef(IR::Block* block, GotoVariable variable, const IR::Value& value) { + goto_vars[variable.index].insert_or_assign(block, value); + } + + const IR::Value& Def(IR::Block* block, IndirectBranchVariable) { + return indirect_branch_var[block]; + } + void SetDef(IR::Block* block, IndirectBranchVariable, const IR::Value& value) { + indirect_branch_var.insert_or_assign(block, value); + } + + const IR::Value& Def(IR::Block* block, ZeroFlagTag) { + return zero_flag[block]; + } + void SetDef(IR::Block* block, ZeroFlagTag, const IR::Value& value) { + zero_flag.insert_or_assign(block, value); + } + + const IR::Value& Def(IR::Block* block, SignFlagTag) { + return sign_flag[block]; + } + void SetDef(IR::Block* block, SignFlagTag, const IR::Value& value) { + sign_flag.insert_or_assign(block, value); + } + + const IR::Value& Def(IR::Block* block, CarryFlagTag) { + return carry_flag[block]; + } + void SetDef(IR::Block* block, CarryFlagTag, const IR::Value& value) { + carry_flag.insert_or_assign(block, value); + } + + const IR::Value& Def(IR::Block* block, OverflowFlagTag) { + return overflow_flag[block]; + } + void SetDef(IR::Block* block, OverflowFlagTag, const IR::Value& value) { + overflow_flag.insert_or_assign(block, value); + } + + std::array<ValueMap, IR::NUM_USER_PREDS> preds; + boost::container::flat_map<u32, ValueMap> goto_vars; + ValueMap indirect_branch_var; + ValueMap zero_flag; + ValueMap sign_flag; + ValueMap carry_flag; + ValueMap overflow_flag; +}; + +IR::Opcode UndefOpcode(IR::Reg) noexcept { + return IR::Opcode::UndefU32; +} + +IR::Opcode UndefOpcode(IR::Pred) noexcept { + return IR::Opcode::UndefU1; +} + +IR::Opcode UndefOpcode(const FlagTag&) noexcept { + return IR::Opcode::UndefU1; +} + +IR::Opcode UndefOpcode(IndirectBranchVariable) noexcept { + return IR::Opcode::UndefU32; +} + +enum class Status { + Start, + SetValue, + PreparePhiArgument, + PushPhiArgument, +}; + +template <typename Type> +struct ReadState { + ReadState(IR::Block* block_) : block{block_} {} + ReadState() = default; + + IR::Block* block{}; + IR::Value result{}; + IR::Inst* phi{}; + IR::Block* const* pred_it{}; + IR::Block* const* pred_end{}; + Status pc{Status::Start}; +}; + +class Pass { +public: + template <typename Type> + void WriteVariable(Type variable, IR::Block* block, const IR::Value& value) { + current_def.SetDef(block, variable, value); + } + + template <typename Type> + IR::Value ReadVariable(Type variable, IR::Block* root_block) { + boost::container::small_vector<ReadState<Type>, 64> stack{ + ReadState<Type>(nullptr), + ReadState<Type>(root_block), + }; + const auto prepare_phi_operand{[&] { + if (stack.back().pred_it == stack.back().pred_end) { + IR::Inst* const phi{stack.back().phi}; + IR::Block* const block{stack.back().block}; + const IR::Value result{TryRemoveTrivialPhi(*phi, block, UndefOpcode(variable))}; + stack.pop_back(); + stack.back().result = result; + WriteVariable(variable, block, result); + } else { + IR::Block* const imm_pred{*stack.back().pred_it}; + stack.back().pc = Status::PushPhiArgument; + stack.emplace_back(imm_pred); + } + }}; + do { + IR::Block* const block{stack.back().block}; + switch (stack.back().pc) { + case Status::Start: { + if (const IR::Value& def = current_def.Def(block, variable); !def.IsEmpty()) { + stack.back().result = def; + } else if (!block->IsSsaSealed()) { + // Incomplete CFG + IR::Inst* phi{&*block->PrependNewInst(block->begin(), IR::Opcode::Phi)}; + phi->SetFlags(IR::TypeOf(UndefOpcode(variable))); + + incomplete_phis[block].insert_or_assign(variable, phi); + stack.back().result = IR::Value{&*phi}; + } else if (const std::span imm_preds = block->ImmPredecessors(); + imm_preds.size() == 1) { + // Optimize the common case of one predecessor: no phi needed + stack.back().pc = Status::SetValue; + stack.emplace_back(imm_preds.front()); + break; + } else { + // Break potential cycles with operandless phi + IR::Inst* const phi{&*block->PrependNewInst(block->begin(), IR::Opcode::Phi)}; + phi->SetFlags(IR::TypeOf(UndefOpcode(variable))); + + WriteVariable(variable, block, IR::Value{phi}); + + stack.back().phi = phi; + stack.back().pred_it = imm_preds.data(); + stack.back().pred_end = imm_preds.data() + imm_preds.size(); + prepare_phi_operand(); + break; + } + } + [[fallthrough]]; + case Status::SetValue: { + const IR::Value result{stack.back().result}; + WriteVariable(variable, block, result); + stack.pop_back(); + stack.back().result = result; + break; + } + case Status::PushPhiArgument: { + IR::Inst* const phi{stack.back().phi}; + phi->AddPhiOperand(*stack.back().pred_it, stack.back().result); + ++stack.back().pred_it; + } + [[fallthrough]]; + case Status::PreparePhiArgument: + prepare_phi_operand(); + break; + } + } while (stack.size() > 1); + return stack.back().result; + } + + void SealBlock(IR::Block* block) { + const auto it{incomplete_phis.find(block)}; + if (it != incomplete_phis.end()) { + for (auto& pair : it->second) { + auto& variant{pair.first}; + auto& phi{pair.second}; + std::visit([&](auto& variable) { AddPhiOperands(variable, *phi, block); }, variant); + } + } + block->SsaSeal(); + } + +private: + template <typename Type> + IR::Value AddPhiOperands(Type variable, IR::Inst& phi, IR::Block* block) { + for (IR::Block* const imm_pred : block->ImmPredecessors()) { + phi.AddPhiOperand(imm_pred, ReadVariable(variable, imm_pred)); + } + return TryRemoveTrivialPhi(phi, block, UndefOpcode(variable)); + } + + IR::Value TryRemoveTrivialPhi(IR::Inst& phi, IR::Block* block, IR::Opcode undef_opcode) { + IR::Value same; + const size_t num_args{phi.NumArgs()}; + for (size_t arg_index = 0; arg_index < num_args; ++arg_index) { + const IR::Value& op{phi.Arg(arg_index)}; + if (op.Resolve() == same.Resolve() || op == IR::Value{&phi}) { + // Unique value or self-reference + continue; + } + if (!same.IsEmpty()) { + // The phi merges at least two values: not trivial + return IR::Value{&phi}; + } + same = op; + } + // Remove the phi node from the block, it will be reinserted + IR::Block::InstructionList& list{block->Instructions()}; + list.erase(IR::Block::InstructionList::s_iterator_to(phi)); + + // Find the first non-phi instruction and use it as an insertion point + IR::Block::iterator reinsert_point{std::ranges::find_if_not(list, IR::IsPhi)}; + if (same.IsEmpty()) { + // The phi is unreachable or in the start block + // Insert an undefined instruction and make it the phi node replacement + // The "phi" node reinsertion point is specified after this instruction + reinsert_point = block->PrependNewInst(reinsert_point, undef_opcode); + same = IR::Value{&*reinsert_point}; + ++reinsert_point; + } + // Reinsert the phi node and reroute all its uses to the "same" value + list.insert(reinsert_point, phi); + phi.ReplaceUsesWith(same); + // TODO: Try to recursively remove all phi users, which might have become trivial + return same; + } + + boost::container::flat_map<IR::Block*, boost::container::flat_map<Variant, IR::Inst*>> + incomplete_phis; + DefTable current_def; +}; + +void VisitInst(Pass& pass, IR::Block* block, IR::Inst& inst) { + switch (inst.GetOpcode()) { + case IR::Opcode::SetRegister: + if (const IR::Reg reg{inst.Arg(0).Reg()}; reg != IR::Reg::RZ) { + pass.WriteVariable(reg, block, inst.Arg(1)); + } + break; + case IR::Opcode::SetPred: + if (const IR::Pred pred{inst.Arg(0).Pred()}; pred != IR::Pred::PT) { + pass.WriteVariable(pred, block, inst.Arg(1)); + } + break; + case IR::Opcode::SetGotoVariable: + pass.WriteVariable(GotoVariable{inst.Arg(0).U32()}, block, inst.Arg(1)); + break; + case IR::Opcode::SetIndirectBranchVariable: + pass.WriteVariable(IndirectBranchVariable{}, block, inst.Arg(0)); + break; + case IR::Opcode::SetZFlag: + pass.WriteVariable(ZeroFlagTag{}, block, inst.Arg(0)); + break; + case IR::Opcode::SetSFlag: + pass.WriteVariable(SignFlagTag{}, block, inst.Arg(0)); + break; + case IR::Opcode::SetCFlag: + pass.WriteVariable(CarryFlagTag{}, block, inst.Arg(0)); + break; + case IR::Opcode::SetOFlag: + pass.WriteVariable(OverflowFlagTag{}, block, inst.Arg(0)); + break; + case IR::Opcode::GetRegister: + if (const IR::Reg reg{inst.Arg(0).Reg()}; reg != IR::Reg::RZ) { + inst.ReplaceUsesWith(pass.ReadVariable(reg, block)); + } + break; + case IR::Opcode::GetPred: + if (const IR::Pred pred{inst.Arg(0).Pred()}; pred != IR::Pred::PT) { + inst.ReplaceUsesWith(pass.ReadVariable(pred, block)); + } + break; + case IR::Opcode::GetGotoVariable: + inst.ReplaceUsesWith(pass.ReadVariable(GotoVariable{inst.Arg(0).U32()}, block)); + break; + case IR::Opcode::GetIndirectBranchVariable: + inst.ReplaceUsesWith(pass.ReadVariable(IndirectBranchVariable{}, block)); + break; + case IR::Opcode::GetZFlag: + inst.ReplaceUsesWith(pass.ReadVariable(ZeroFlagTag{}, block)); + break; + case IR::Opcode::GetSFlag: + inst.ReplaceUsesWith(pass.ReadVariable(SignFlagTag{}, block)); + break; + case IR::Opcode::GetCFlag: + inst.ReplaceUsesWith(pass.ReadVariable(CarryFlagTag{}, block)); + break; + case IR::Opcode::GetOFlag: + inst.ReplaceUsesWith(pass.ReadVariable(OverflowFlagTag{}, block)); + break; + default: + break; + } +} + +void VisitBlock(Pass& pass, IR::Block* block) { + for (IR::Inst& inst : block->Instructions()) { + VisitInst(pass, block, inst); + } + pass.SealBlock(block); +} +} // Anonymous namespace + +void SsaRewritePass(IR::Program& program) { + Pass pass; + const auto end{program.post_order_blocks.rend()}; + for (auto block = program.post_order_blocks.rbegin(); block != end; ++block) { + VisitBlock(pass, *block); + } +} + +} // namespace Shader::Optimization diff --git a/src/shader_recompiler/ir_opt/texture_pass.cpp b/src/shader_recompiler/ir_opt/texture_pass.cpp new file mode 100644 index 000000000..44ad10d43 --- /dev/null +++ b/src/shader_recompiler/ir_opt/texture_pass.cpp @@ -0,0 +1,523 @@ +// Copyright 2021 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <algorithm> +#include <bit> +#include <optional> + +#include <boost/container/small_vector.hpp> + +#include "shader_recompiler/environment.h" +#include "shader_recompiler/frontend/ir/basic_block.h" +#include "shader_recompiler/frontend/ir/breadth_first_search.h" +#include "shader_recompiler/frontend/ir/ir_emitter.h" +#include "shader_recompiler/ir_opt/passes.h" +#include "shader_recompiler/shader_info.h" + +namespace Shader::Optimization { +namespace { +struct ConstBufferAddr { + u32 index; + u32 offset; + u32 secondary_index; + u32 secondary_offset; + IR::U32 dynamic_offset; + u32 count; + bool has_secondary; +}; + +struct TextureInst { + ConstBufferAddr cbuf; + IR::Inst* inst; + IR::Block* block; +}; + +using TextureInstVector = boost::container::small_vector<TextureInst, 24>; + +constexpr u32 DESCRIPTOR_SIZE = 8; +constexpr u32 DESCRIPTOR_SIZE_SHIFT = static_cast<u32>(std::countr_zero(DESCRIPTOR_SIZE)); + +IR::Opcode IndexedInstruction(const IR::Inst& inst) { + switch (inst.GetOpcode()) { + case IR::Opcode::BindlessImageSampleImplicitLod: + case IR::Opcode::BoundImageSampleImplicitLod: + return IR::Opcode::ImageSampleImplicitLod; + case IR::Opcode::BoundImageSampleExplicitLod: + case IR::Opcode::BindlessImageSampleExplicitLod: + return IR::Opcode::ImageSampleExplicitLod; + case IR::Opcode::BoundImageSampleDrefImplicitLod: + case IR::Opcode::BindlessImageSampleDrefImplicitLod: + return IR::Opcode::ImageSampleDrefImplicitLod; + case IR::Opcode::BoundImageSampleDrefExplicitLod: + case IR::Opcode::BindlessImageSampleDrefExplicitLod: + return IR::Opcode::ImageSampleDrefExplicitLod; + case IR::Opcode::BindlessImageGather: + case IR::Opcode::BoundImageGather: + return IR::Opcode::ImageGather; + case IR::Opcode::BindlessImageGatherDref: + case IR::Opcode::BoundImageGatherDref: + return IR::Opcode::ImageGatherDref; + case IR::Opcode::BindlessImageFetch: + case IR::Opcode::BoundImageFetch: + return IR::Opcode::ImageFetch; + case IR::Opcode::BoundImageQueryDimensions: + case IR::Opcode::BindlessImageQueryDimensions: + return IR::Opcode::ImageQueryDimensions; + case IR::Opcode::BoundImageQueryLod: + case IR::Opcode::BindlessImageQueryLod: + return IR::Opcode::ImageQueryLod; + case IR::Opcode::BoundImageGradient: + case IR::Opcode::BindlessImageGradient: + return IR::Opcode::ImageGradient; + case IR::Opcode::BoundImageRead: + case IR::Opcode::BindlessImageRead: + return IR::Opcode::ImageRead; + case IR::Opcode::BoundImageWrite: + case IR::Opcode::BindlessImageWrite: + return IR::Opcode::ImageWrite; + case IR::Opcode::BoundImageAtomicIAdd32: + case IR::Opcode::BindlessImageAtomicIAdd32: + return IR::Opcode::ImageAtomicIAdd32; + case IR::Opcode::BoundImageAtomicSMin32: + case IR::Opcode::BindlessImageAtomicSMin32: + return IR::Opcode::ImageAtomicSMin32; + case IR::Opcode::BoundImageAtomicUMin32: + case IR::Opcode::BindlessImageAtomicUMin32: + return IR::Opcode::ImageAtomicUMin32; + case IR::Opcode::BoundImageAtomicSMax32: + case IR::Opcode::BindlessImageAtomicSMax32: + return IR::Opcode::ImageAtomicSMax32; + case IR::Opcode::BoundImageAtomicUMax32: + case IR::Opcode::BindlessImageAtomicUMax32: + return IR::Opcode::ImageAtomicUMax32; + case IR::Opcode::BoundImageAtomicInc32: + case IR::Opcode::BindlessImageAtomicInc32: + return IR::Opcode::ImageAtomicInc32; + case IR::Opcode::BoundImageAtomicDec32: + case IR::Opcode::BindlessImageAtomicDec32: + return IR::Opcode::ImageAtomicDec32; + case IR::Opcode::BoundImageAtomicAnd32: + case IR::Opcode::BindlessImageAtomicAnd32: + return IR::Opcode::ImageAtomicAnd32; + case IR::Opcode::BoundImageAtomicOr32: + case IR::Opcode::BindlessImageAtomicOr32: + return IR::Opcode::ImageAtomicOr32; + case IR::Opcode::BoundImageAtomicXor32: + case IR::Opcode::BindlessImageAtomicXor32: + return IR::Opcode::ImageAtomicXor32; + case IR::Opcode::BoundImageAtomicExchange32: + case IR::Opcode::BindlessImageAtomicExchange32: + return IR::Opcode::ImageAtomicExchange32; + default: + return IR::Opcode::Void; + } +} + +bool IsBindless(const IR::Inst& inst) { + switch (inst.GetOpcode()) { + case IR::Opcode::BindlessImageSampleImplicitLod: + case IR::Opcode::BindlessImageSampleExplicitLod: + case IR::Opcode::BindlessImageSampleDrefImplicitLod: + case IR::Opcode::BindlessImageSampleDrefExplicitLod: + case IR::Opcode::BindlessImageGather: + case IR::Opcode::BindlessImageGatherDref: + case IR::Opcode::BindlessImageFetch: + case IR::Opcode::BindlessImageQueryDimensions: + case IR::Opcode::BindlessImageQueryLod: + case IR::Opcode::BindlessImageGradient: + case IR::Opcode::BindlessImageRead: + case IR::Opcode::BindlessImageWrite: + case IR::Opcode::BindlessImageAtomicIAdd32: + case IR::Opcode::BindlessImageAtomicSMin32: + case IR::Opcode::BindlessImageAtomicUMin32: + case IR::Opcode::BindlessImageAtomicSMax32: + case IR::Opcode::BindlessImageAtomicUMax32: + case IR::Opcode::BindlessImageAtomicInc32: + case IR::Opcode::BindlessImageAtomicDec32: + case IR::Opcode::BindlessImageAtomicAnd32: + case IR::Opcode::BindlessImageAtomicOr32: + case IR::Opcode::BindlessImageAtomicXor32: + case IR::Opcode::BindlessImageAtomicExchange32: + return true; + case IR::Opcode::BoundImageSampleImplicitLod: + case IR::Opcode::BoundImageSampleExplicitLod: + case IR::Opcode::BoundImageSampleDrefImplicitLod: + case IR::Opcode::BoundImageSampleDrefExplicitLod: + case IR::Opcode::BoundImageGather: + case IR::Opcode::BoundImageGatherDref: + case IR::Opcode::BoundImageFetch: + case IR::Opcode::BoundImageQueryDimensions: + case IR::Opcode::BoundImageQueryLod: + case IR::Opcode::BoundImageGradient: + case IR::Opcode::BoundImageRead: + case IR::Opcode::BoundImageWrite: + case IR::Opcode::BoundImageAtomicIAdd32: + case IR::Opcode::BoundImageAtomicSMin32: + case IR::Opcode::BoundImageAtomicUMin32: + case IR::Opcode::BoundImageAtomicSMax32: + case IR::Opcode::BoundImageAtomicUMax32: + case IR::Opcode::BoundImageAtomicInc32: + case IR::Opcode::BoundImageAtomicDec32: + case IR::Opcode::BoundImageAtomicAnd32: + case IR::Opcode::BoundImageAtomicOr32: + case IR::Opcode::BoundImageAtomicXor32: + case IR::Opcode::BoundImageAtomicExchange32: + return false; + default: + throw InvalidArgument("Invalid opcode {}", inst.GetOpcode()); + } +} + +bool IsTextureInstruction(const IR::Inst& inst) { + return IndexedInstruction(inst) != IR::Opcode::Void; +} + +std::optional<ConstBufferAddr> TryGetConstBuffer(const IR::Inst* inst); + +std::optional<ConstBufferAddr> Track(const IR::Value& value) { + return IR::BreadthFirstSearch(value, TryGetConstBuffer); +} + +std::optional<ConstBufferAddr> TryGetConstBuffer(const IR::Inst* inst) { + switch (inst->GetOpcode()) { + default: + return std::nullopt; + case IR::Opcode::BitwiseOr32: { + std::optional lhs{Track(inst->Arg(0))}; + std::optional rhs{Track(inst->Arg(1))}; + if (!lhs || !rhs) { + return std::nullopt; + } + if (lhs->has_secondary || rhs->has_secondary) { + return std::nullopt; + } + if (lhs->count > 1 || rhs->count > 1) { + return std::nullopt; + } + if (lhs->index > rhs->index || lhs->offset > rhs->offset) { + std::swap(lhs, rhs); + } + return ConstBufferAddr{ + .index = lhs->index, + .offset = lhs->offset, + .secondary_index = rhs->index, + .secondary_offset = rhs->offset, + .dynamic_offset = {}, + .count = 1, + .has_secondary = true, + }; + } + case IR::Opcode::GetCbufU32x2: + case IR::Opcode::GetCbufU32: + break; + } + const IR::Value index{inst->Arg(0)}; + const IR::Value offset{inst->Arg(1)}; + if (!index.IsImmediate()) { + // Reading a bindless texture from variable indices is valid + // but not supported here at the moment + return std::nullopt; + } + if (offset.IsImmediate()) { + return ConstBufferAddr{ + .index = index.U32(), + .offset = offset.U32(), + .secondary_index = 0, + .secondary_offset = 0, + .dynamic_offset = {}, + .count = 1, + .has_secondary = false, + }; + } + IR::Inst* const offset_inst{offset.InstRecursive()}; + if (offset_inst->GetOpcode() != IR::Opcode::IAdd32) { + return std::nullopt; + } + u32 base_offset{}; + IR::U32 dynamic_offset; + if (offset_inst->Arg(0).IsImmediate()) { + base_offset = offset_inst->Arg(0).U32(); + dynamic_offset = IR::U32{offset_inst->Arg(1)}; + } else if (offset_inst->Arg(1).IsImmediate()) { + base_offset = offset_inst->Arg(1).U32(); + dynamic_offset = IR::U32{offset_inst->Arg(0)}; + } else { + return std::nullopt; + } + return ConstBufferAddr{ + .index = index.U32(), + .offset = base_offset, + .secondary_index = 0, + .secondary_offset = 0, + .dynamic_offset = dynamic_offset, + .count = 8, + .has_secondary = false, + }; +} + +TextureInst MakeInst(Environment& env, IR::Block* block, IR::Inst& inst) { + ConstBufferAddr addr; + if (IsBindless(inst)) { + const std::optional<ConstBufferAddr> track_addr{Track(inst.Arg(0))}; + if (!track_addr) { + throw NotImplementedException("Failed to track bindless texture constant buffer"); + } + addr = *track_addr; + } else { + addr = ConstBufferAddr{ + .index = env.TextureBoundBuffer(), + .offset = inst.Arg(0).U32(), + .secondary_index = 0, + .secondary_offset = 0, + .dynamic_offset = {}, + .count = 1, + .has_secondary = false, + }; + } + return TextureInst{ + .cbuf = addr, + .inst = &inst, + .block = block, + }; +} + +TextureType ReadTextureType(Environment& env, const ConstBufferAddr& cbuf) { + const u32 secondary_index{cbuf.has_secondary ? cbuf.secondary_index : cbuf.index}; + const u32 secondary_offset{cbuf.has_secondary ? cbuf.secondary_offset : cbuf.offset}; + const u32 lhs_raw{env.ReadCbufValue(cbuf.index, cbuf.offset)}; + const u32 rhs_raw{env.ReadCbufValue(secondary_index, secondary_offset)}; + return env.ReadTextureType(lhs_raw | rhs_raw); +} + +class Descriptors { +public: + explicit Descriptors(TextureBufferDescriptors& texture_buffer_descriptors_, + ImageBufferDescriptors& image_buffer_descriptors_, + TextureDescriptors& texture_descriptors_, + ImageDescriptors& image_descriptors_) + : texture_buffer_descriptors{texture_buffer_descriptors_}, + image_buffer_descriptors{image_buffer_descriptors_}, + texture_descriptors{texture_descriptors_}, image_descriptors{image_descriptors_} {} + + u32 Add(const TextureBufferDescriptor& desc) { + return Add(texture_buffer_descriptors, desc, [&desc](const auto& existing) { + return desc.cbuf_index == existing.cbuf_index && + desc.cbuf_offset == existing.cbuf_offset && + desc.secondary_cbuf_index == existing.secondary_cbuf_index && + desc.secondary_cbuf_offset == existing.secondary_cbuf_offset && + desc.count == existing.count && desc.size_shift == existing.size_shift && + desc.has_secondary == existing.has_secondary; + }); + } + + u32 Add(const ImageBufferDescriptor& desc) { + const u32 index{Add(image_buffer_descriptors, desc, [&desc](const auto& existing) { + return desc.format == existing.format && desc.cbuf_index == existing.cbuf_index && + desc.cbuf_offset == existing.cbuf_offset && desc.count == existing.count && + desc.size_shift == existing.size_shift; + })}; + image_buffer_descriptors[index].is_written |= desc.is_written; + image_buffer_descriptors[index].is_read |= desc.is_read; + return index; + } + + u32 Add(const TextureDescriptor& desc) { + return Add(texture_descriptors, desc, [&desc](const auto& existing) { + return desc.type == existing.type && desc.is_depth == existing.is_depth && + desc.has_secondary == existing.has_secondary && + desc.cbuf_index == existing.cbuf_index && + desc.cbuf_offset == existing.cbuf_offset && + desc.secondary_cbuf_index == existing.secondary_cbuf_index && + desc.secondary_cbuf_offset == existing.secondary_cbuf_offset && + desc.count == existing.count && desc.size_shift == existing.size_shift; + }); + } + + u32 Add(const ImageDescriptor& desc) { + const u32 index{Add(image_descriptors, desc, [&desc](const auto& existing) { + return desc.type == existing.type && desc.format == existing.format && + desc.cbuf_index == existing.cbuf_index && + desc.cbuf_offset == existing.cbuf_offset && desc.count == existing.count && + desc.size_shift == existing.size_shift; + })}; + image_descriptors[index].is_written |= desc.is_written; + image_descriptors[index].is_read |= desc.is_read; + return index; + } + +private: + template <typename Descriptors, typename Descriptor, typename Func> + static u32 Add(Descriptors& descriptors, const Descriptor& desc, Func&& pred) { + // TODO: Handle arrays + const auto it{std::ranges::find_if(descriptors, pred)}; + if (it != descriptors.end()) { + return static_cast<u32>(std::distance(descriptors.begin(), it)); + } + descriptors.push_back(desc); + return static_cast<u32>(descriptors.size()) - 1; + } + + TextureBufferDescriptors& texture_buffer_descriptors; + ImageBufferDescriptors& image_buffer_descriptors; + TextureDescriptors& texture_descriptors; + ImageDescriptors& image_descriptors; +}; +} // Anonymous namespace + +void TexturePass(Environment& env, IR::Program& program) { + TextureInstVector to_replace; + for (IR::Block* const block : program.post_order_blocks) { + for (IR::Inst& inst : block->Instructions()) { + if (!IsTextureInstruction(inst)) { + continue; + } + to_replace.push_back(MakeInst(env, block, inst)); + } + } + // Sort instructions to visit textures by constant buffer index, then by offset + std::ranges::sort(to_replace, [](const auto& lhs, const auto& rhs) { + return lhs.cbuf.offset < rhs.cbuf.offset; + }); + std::stable_sort(to_replace.begin(), to_replace.end(), [](const auto& lhs, const auto& rhs) { + return lhs.cbuf.index < rhs.cbuf.index; + }); + Descriptors descriptors{ + program.info.texture_buffer_descriptors, + program.info.image_buffer_descriptors, + program.info.texture_descriptors, + program.info.image_descriptors, + }; + for (TextureInst& texture_inst : to_replace) { + // TODO: Handle arrays + IR::Inst* const inst{texture_inst.inst}; + inst->ReplaceOpcode(IndexedInstruction(*inst)); + + const auto& cbuf{texture_inst.cbuf}; + auto flags{inst->Flags<IR::TextureInstInfo>()}; + switch (inst->GetOpcode()) { + case IR::Opcode::ImageQueryDimensions: + flags.type.Assign(ReadTextureType(env, cbuf)); + inst->SetFlags(flags); + break; + case IR::Opcode::ImageFetch: + if (flags.type != TextureType::Color1D) { + break; + } + if (ReadTextureType(env, cbuf) == TextureType::Buffer) { + // Replace with the bound texture type only when it's a texture buffer + // If the instruction is 1D and the bound type is 2D, don't change the code and let + // the rasterizer robustness handle it + // This happens on Fire Emblem: Three Houses + flags.type.Assign(TextureType::Buffer); + } + break; + default: + break; + } + u32 index; + switch (inst->GetOpcode()) { + case IR::Opcode::ImageRead: + case IR::Opcode::ImageAtomicIAdd32: + case IR::Opcode::ImageAtomicSMin32: + case IR::Opcode::ImageAtomicUMin32: + case IR::Opcode::ImageAtomicSMax32: + case IR::Opcode::ImageAtomicUMax32: + case IR::Opcode::ImageAtomicInc32: + case IR::Opcode::ImageAtomicDec32: + case IR::Opcode::ImageAtomicAnd32: + case IR::Opcode::ImageAtomicOr32: + case IR::Opcode::ImageAtomicXor32: + case IR::Opcode::ImageAtomicExchange32: + case IR::Opcode::ImageWrite: { + if (cbuf.has_secondary) { + throw NotImplementedException("Unexpected separate sampler"); + } + const bool is_written{inst->GetOpcode() != IR::Opcode::ImageRead}; + const bool is_read{inst->GetOpcode() != IR::Opcode::ImageWrite}; + if (flags.type == TextureType::Buffer) { + index = descriptors.Add(ImageBufferDescriptor{ + .format = flags.image_format, + .is_written = is_written, + .is_read = is_read, + .cbuf_index = cbuf.index, + .cbuf_offset = cbuf.offset, + .count = cbuf.count, + .size_shift = DESCRIPTOR_SIZE_SHIFT, + }); + } else { + index = descriptors.Add(ImageDescriptor{ + .type = flags.type, + .format = flags.image_format, + .is_written = is_written, + .is_read = is_read, + .cbuf_index = cbuf.index, + .cbuf_offset = cbuf.offset, + .count = cbuf.count, + .size_shift = DESCRIPTOR_SIZE_SHIFT, + }); + } + break; + } + default: + if (flags.type == TextureType::Buffer) { + index = descriptors.Add(TextureBufferDescriptor{ + .has_secondary = cbuf.has_secondary, + .cbuf_index = cbuf.index, + .cbuf_offset = cbuf.offset, + .secondary_cbuf_index = cbuf.secondary_index, + .secondary_cbuf_offset = cbuf.secondary_offset, + .count = cbuf.count, + .size_shift = DESCRIPTOR_SIZE_SHIFT, + }); + } else { + index = descriptors.Add(TextureDescriptor{ + .type = flags.type, + .is_depth = flags.is_depth != 0, + .has_secondary = cbuf.has_secondary, + .cbuf_index = cbuf.index, + .cbuf_offset = cbuf.offset, + .secondary_cbuf_index = cbuf.secondary_index, + .secondary_cbuf_offset = cbuf.secondary_offset, + .count = cbuf.count, + .size_shift = DESCRIPTOR_SIZE_SHIFT, + }); + } + break; + } + flags.descriptor_index.Assign(index); + inst->SetFlags(flags); + + if (cbuf.count > 1) { + const auto insert_point{IR::Block::InstructionList::s_iterator_to(*inst)}; + IR::IREmitter ir{*texture_inst.block, insert_point}; + const IR::U32 shift{ir.Imm32(std::countr_zero(DESCRIPTOR_SIZE))}; + inst->SetArg(0, ir.ShiftRightArithmetic(cbuf.dynamic_offset, shift)); + } else { + inst->SetArg(0, IR::Value{}); + } + } +} + +void JoinTextureInfo(Info& base, Info& source) { + Descriptors descriptors{ + base.texture_buffer_descriptors, + base.image_buffer_descriptors, + base.texture_descriptors, + base.image_descriptors, + }; + for (auto& desc : source.texture_buffer_descriptors) { + descriptors.Add(desc); + } + for (auto& desc : source.image_buffer_descriptors) { + descriptors.Add(desc); + } + for (auto& desc : source.texture_descriptors) { + descriptors.Add(desc); + } + for (auto& desc : source.image_descriptors) { + descriptors.Add(desc); + } +} + +} // namespace Shader::Optimization diff --git a/src/shader_recompiler/ir_opt/verification_pass.cpp b/src/shader_recompiler/ir_opt/verification_pass.cpp new file mode 100644 index 000000000..975d5aadf --- /dev/null +++ b/src/shader_recompiler/ir_opt/verification_pass.cpp @@ -0,0 +1,98 @@ +// Copyright 2021 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <map> +#include <set> + +#include "shader_recompiler/exception.h" +#include "shader_recompiler/frontend/ir/basic_block.h" +#include "shader_recompiler/frontend/ir/value.h" +#include "shader_recompiler/ir_opt/passes.h" + +namespace Shader::Optimization { + +static void ValidateTypes(const IR::Program& program) { + for (const auto& block : program.blocks) { + for (const IR::Inst& inst : *block) { + if (inst.GetOpcode() == IR::Opcode::Phi) { + // Skip validation on phi nodes + continue; + } + const size_t num_args{inst.NumArgs()}; + for (size_t i = 0; i < num_args; ++i) { + const IR::Type t1{inst.Arg(i).Type()}; + const IR::Type t2{IR::ArgTypeOf(inst.GetOpcode(), i)}; + if (!IR::AreTypesCompatible(t1, t2)) { + throw LogicError("Invalid types in block:\n{}", IR::DumpBlock(*block)); + } + } + } + } +} + +static void ValidateUses(const IR::Program& program) { + std::map<IR::Inst*, int> actual_uses; + for (const auto& block : program.blocks) { + for (const IR::Inst& inst : *block) { + const size_t num_args{inst.NumArgs()}; + for (size_t i = 0; i < num_args; ++i) { + const IR::Value arg{inst.Arg(i)}; + if (!arg.IsImmediate()) { + ++actual_uses[arg.Inst()]; + } + } + } + } + for (const auto [inst, uses] : actual_uses) { + if (inst->UseCount() != uses) { + throw LogicError("Invalid uses in block: {}", IR::DumpProgram(program)); + } + } +} + +static void ValidateForwardDeclarations(const IR::Program& program) { + std::set<const IR::Inst*> definitions; + for (const IR::Block* const block : program.blocks) { + for (const IR::Inst& inst : *block) { + definitions.emplace(&inst); + if (inst.GetOpcode() == IR::Opcode::Phi) { + // Phi nodes can have forward declarations + continue; + } + const size_t num_args{inst.NumArgs()}; + for (size_t arg = 0; arg < num_args; ++arg) { + if (inst.Arg(arg).IsImmediate()) { + continue; + } + if (!definitions.contains(inst.Arg(arg).Inst())) { + throw LogicError("Forward declaration in block: {}", IR::DumpBlock(*block)); + } + } + } + } +} + +static void ValidatePhiNodes(const IR::Program& program) { + for (const IR::Block* const block : program.blocks) { + bool no_more_phis{false}; + for (const IR::Inst& inst : *block) { + if (inst.GetOpcode() == IR::Opcode::Phi) { + if (no_more_phis) { + throw LogicError("Interleaved phi nodes: {}", IR::DumpBlock(*block)); + } + } else { + no_more_phis = true; + } + } + } +} + +void VerificationPass(const IR::Program& program) { + ValidateTypes(program); + ValidateUses(program); + ValidateForwardDeclarations(program); + ValidatePhiNodes(program); +} + +} // namespace Shader::Optimization |