diff options
Diffstat (limited to 'src/video_core/engines')
-rw-r--r-- | src/video_core/engines/fermi_2d.cpp | 9 | ||||
-rw-r--r-- | src/video_core/engines/kepler_memory.cpp | 11 | ||||
-rw-r--r-- | src/video_core/engines/kepler_memory.h | 7 | ||||
-rw-r--r-- | src/video_core/engines/maxwell_3d.h | 5 | ||||
-rw-r--r-- | src/video_core/engines/maxwell_dma.cpp | 73 | ||||
-rw-r--r-- | src/video_core/engines/maxwell_dma.h | 8 | ||||
-rw-r--r-- | src/video_core/engines/shader_bytecode.h | 151 |
7 files changed, 240 insertions, 24 deletions
diff --git a/src/video_core/engines/fermi_2d.cpp b/src/video_core/engines/fermi_2d.cpp index 597b279b9..74e44c7fe 100644 --- a/src/video_core/engines/fermi_2d.cpp +++ b/src/video_core/engines/fermi_2d.cpp @@ -47,9 +47,12 @@ void Fermi2D::HandleSurfaceCopy() { u32 dst_bytes_per_pixel = RenderTargetBytesPerPixel(regs.dst.format); if (!rasterizer.AccelerateSurfaceCopy(regs.src, regs.dst)) { - // TODO(bunnei): The below implementation currently will not get hit, as - // AccelerateSurfaceCopy tries to always copy and will always return success. This should be - // changed once we properly support flushing. + rasterizer.FlushRegion(source_cpu, src_bytes_per_pixel * regs.src.width * regs.src.height); + // We have to invalidate the destination region to evict any outdated surfaces from the + // cache. We do this before actually writing the new data because the destination address + // might contain a dirty surface that will have to be written back to memory. + rasterizer.InvalidateRegion(dest_cpu, + dst_bytes_per_pixel * regs.dst.width * regs.dst.height); if (regs.src.linear == regs.dst.linear) { // If the input layout and the output layout are the same, just perform a raw copy. diff --git a/src/video_core/engines/kepler_memory.cpp b/src/video_core/engines/kepler_memory.cpp index 66ae6332d..585290d9f 100644 --- a/src/video_core/engines/kepler_memory.cpp +++ b/src/video_core/engines/kepler_memory.cpp @@ -5,10 +5,14 @@ #include "common/logging/log.h" #include "core/memory.h" #include "video_core/engines/kepler_memory.h" +#include "video_core/rasterizer_interface.h" namespace Tegra::Engines { -KeplerMemory::KeplerMemory(MemoryManager& memory_manager) : memory_manager(memory_manager) {} +KeplerMemory::KeplerMemory(VideoCore::RasterizerInterface& rasterizer, + MemoryManager& memory_manager) + : memory_manager(memory_manager), rasterizer{rasterizer} {} + KeplerMemory::~KeplerMemory() = default; void KeplerMemory::WriteReg(u32 method, u32 value) { @@ -37,6 +41,11 @@ void KeplerMemory::ProcessData(u32 data) { VAddr dest_address = *memory_manager.GpuToCpuAddress(address + state.write_offset * sizeof(u32)); + // We have to invalidate the destination region to evict any outdated surfaces from the cache. + // We do this before actually writing the new data because the destination address might contain + // a dirty surface that will have to be written back to memory. + rasterizer.InvalidateRegion(dest_address, sizeof(u32)); + Memory::Write32(dest_address, data); state.write_offset++; diff --git a/src/video_core/engines/kepler_memory.h b/src/video_core/engines/kepler_memory.h index b0d0078cf..bf4a13cff 100644 --- a/src/video_core/engines/kepler_memory.h +++ b/src/video_core/engines/kepler_memory.h @@ -11,6 +11,10 @@ #include "common/common_types.h" #include "video_core/memory_manager.h" +namespace VideoCore { +class RasterizerInterface; +} + namespace Tegra::Engines { #define KEPLERMEMORY_REG_INDEX(field_name) \ @@ -18,7 +22,7 @@ namespace Tegra::Engines { class KeplerMemory final { public: - KeplerMemory(MemoryManager& memory_manager); + KeplerMemory(VideoCore::RasterizerInterface& rasterizer, MemoryManager& memory_manager); ~KeplerMemory(); /// Write the value to the register identified by method. @@ -72,6 +76,7 @@ public: private: MemoryManager& memory_manager; + VideoCore::RasterizerInterface& rasterizer; void ProcessData(u32 data); }; diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h index c8d1b6478..c8af1c6b6 100644 --- a/src/video_core/engines/maxwell_3d.h +++ b/src/video_core/engines/maxwell_3d.h @@ -448,7 +448,10 @@ public: BitField<8, 3, u32> block_depth; BitField<12, 1, InvMemoryLayout> type; } memory_layout; - u32 array_mode; + union { + BitField<0, 16, u32> array_mode; + BitField<16, 1, u32> volume; + }; u32 layer_stride; u32 base_layer; INSERT_PADDING_WORDS(7); diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp index bf2a21bb6..103cd110e 100644 --- a/src/video_core/engines/maxwell_dma.cpp +++ b/src/video_core/engines/maxwell_dma.cpp @@ -4,12 +4,14 @@ #include "core/memory.h" #include "video_core/engines/maxwell_dma.h" +#include "video_core/rasterizer_interface.h" #include "video_core/textures/decoders.h" namespace Tegra { namespace Engines { -MaxwellDMA::MaxwellDMA(MemoryManager& memory_manager) : memory_manager(memory_manager) {} +MaxwellDMA::MaxwellDMA(VideoCore::RasterizerInterface& rasterizer, MemoryManager& memory_manager) + : memory_manager(memory_manager), rasterizer{rasterizer} {} void MaxwellDMA::WriteReg(u32 method, u32 value) { ASSERT_MSG(method < Regs::NUM_REGS, @@ -44,38 +46,79 @@ void MaxwellDMA::HandleCopy() { ASSERT(regs.exec.query_mode == Regs::QueryMode::None); ASSERT(regs.exec.query_intr == Regs::QueryIntr::None); ASSERT(regs.exec.copy_mode == Regs::CopyMode::Unk2); - ASSERT(regs.src_params.pos_x == 0); - ASSERT(regs.src_params.pos_y == 0); ASSERT(regs.dst_params.pos_x == 0); ASSERT(regs.dst_params.pos_y == 0); - if (regs.exec.is_dst_linear == regs.exec.is_src_linear) { - std::size_t copy_size = regs.x_count; + if (!regs.exec.is_dst_linear && !regs.exec.is_src_linear) { + // If both the source and the destination are in block layout, assert. + UNREACHABLE_MSG("Tiled->Tiled DMA transfers are not yet implemented"); + return; + } + if (regs.exec.is_dst_linear && regs.exec.is_src_linear) { // When the enable_2d bit is disabled, the copy is performed as if we were copying a 1D - // buffer of length `x_count`, otherwise we copy a 2D buffer of size (x_count, y_count). - if (regs.exec.enable_2d) { - copy_size = copy_size * regs.y_count; + // buffer of length `x_count`, otherwise we copy a 2D image of dimensions (x_count, + // y_count). + if (!regs.exec.enable_2d) { + Memory::CopyBlock(dest_cpu, source_cpu, regs.x_count); + return; } - Memory::CopyBlock(dest_cpu, source_cpu, copy_size); + // If both the source and the destination are in linear layout, perform a line-by-line + // copy. We're going to take a subrect of size (x_count, y_count) from the source + // rectangle. There is no need to manually flush/invalidate the regions because + // CopyBlock does that for us. + for (u32 line = 0; line < regs.y_count; ++line) { + const VAddr source_line = source_cpu + line * regs.src_pitch; + const VAddr dest_line = dest_cpu + line * regs.dst_pitch; + Memory::CopyBlock(dest_line, source_line, regs.x_count); + } return; } ASSERT(regs.exec.enable_2d == 1); + + std::size_t copy_size = regs.x_count * regs.y_count; + + const auto FlushAndInvalidate = [&](u32 src_size, u32 dst_size) { + // TODO(Subv): For now, manually flush the regions until we implement GPU-accelerated + // copying. + rasterizer.FlushRegion(source_cpu, src_size); + + // We have to invalidate the destination region to evict any outdated surfaces from the + // cache. We do this before actually writing the new data because the destination address + // might contain a dirty surface that will have to be written back to memory. + rasterizer.InvalidateRegion(dest_cpu, dst_size); + }; + u8* src_buffer = Memory::GetPointer(source_cpu); u8* dst_buffer = Memory::GetPointer(dest_cpu); if (regs.exec.is_dst_linear && !regs.exec.is_src_linear) { + ASSERT(regs.src_params.size_z == 1); // If the input is tiled and the output is linear, deswizzle the input and copy it over. - Texture::CopySwizzledData(regs.src_params.size_x, regs.src_params.size_y, - regs.src_params.size_z, 1, 1, src_buffer, dst_buffer, true, - regs.src_params.BlockHeight(), regs.src_params.BlockDepth()); + + u32 src_bytes_per_pixel = regs.src_pitch / regs.src_params.size_x; + + FlushAndInvalidate(regs.src_pitch * regs.src_params.size_y, + copy_size * src_bytes_per_pixel); + + Texture::UnswizzleSubrect(regs.x_count, regs.y_count, regs.dst_pitch, + regs.src_params.size_x, src_bytes_per_pixel, source_cpu, dest_cpu, + regs.src_params.BlockHeight(), regs.src_params.pos_x, + regs.src_params.pos_y); } else { + ASSERT(regs.dst_params.size_z == 1); + ASSERT(regs.src_pitch == regs.x_count); + + u32 src_bpp = regs.src_pitch / regs.x_count; + + FlushAndInvalidate(regs.src_pitch * regs.y_count, + regs.dst_params.size_x * regs.dst_params.size_y * src_bpp); + // If the input is linear and the output is tiled, swizzle the input and copy it over. - Texture::CopySwizzledData(regs.dst_params.size_x, regs.dst_params.size_y, - regs.dst_params.size_z, 1, 1, dst_buffer, src_buffer, false, - regs.dst_params.BlockHeight(), regs.dst_params.BlockDepth()); + Texture::SwizzleSubrect(regs.x_count, regs.y_count, regs.src_pitch, regs.dst_params.size_x, + src_bpp, dest_cpu, source_cpu, regs.dst_params.BlockHeight()); } } diff --git a/src/video_core/engines/maxwell_dma.h b/src/video_core/engines/maxwell_dma.h index df19e02e2..5f3704f05 100644 --- a/src/video_core/engines/maxwell_dma.h +++ b/src/video_core/engines/maxwell_dma.h @@ -12,11 +12,15 @@ #include "video_core/gpu.h" #include "video_core/memory_manager.h" +namespace VideoCore { +class RasterizerInterface; +} + namespace Tegra::Engines { class MaxwellDMA final { public: - explicit MaxwellDMA(MemoryManager& memory_manager); + explicit MaxwellDMA(VideoCore::RasterizerInterface& rasterizer, MemoryManager& memory_manager); ~MaxwellDMA() = default; /// Write the value to the register identified by method. @@ -133,6 +137,8 @@ public: MemoryManager& memory_manager; private: + VideoCore::RasterizerInterface& rasterizer; + /// Performs the copy from the source buffer to the destination buffer as configured in the /// registers. void HandleCopy(); diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h index 8c8d65769..ac50bb622 100644 --- a/src/video_core/engines/shader_bytecode.h +++ b/src/video_core/engines/shader_bytecode.h @@ -335,6 +335,26 @@ enum class IsberdMode : u64 { enum class IsberdShift : u64 { None = 0, U16 = 1, B32 = 2 }; +enum class HalfType : u64 { + H0_H1 = 0, + F32 = 1, + H0_H0 = 2, + H1_H1 = 3, +}; + +enum class HalfMerge : u64 { + H0_H1 = 0, + F32 = 1, + Mrg_H0 = 2, + Mrg_H1 = 3, +}; + +enum class HalfPrecision : u64 { + None = 0, + FTZ = 1, + FMZ = 2, +}; + enum class IpaInterpMode : u64 { Linear = 0, Perspective = 1, @@ -544,6 +564,10 @@ union Instruction { } fmul; union { + BitField<55, 1, u64> saturate; + } fmul32; + + union { BitField<48, 1, u64> is_signed; } shift; @@ -554,6 +578,70 @@ union Instruction { } alu_integer; union { + BitField<39, 1, u64> ftz; + BitField<32, 1, u64> saturate; + BitField<49, 2, HalfMerge> merge; + + BitField<43, 1, u64> negate_a; + BitField<44, 1, u64> abs_a; + BitField<47, 2, HalfType> type_a; + + BitField<31, 1, u64> negate_b; + BitField<30, 1, u64> abs_b; + BitField<47, 2, HalfType> type_b; + + BitField<35, 2, HalfType> type_c; + } alu_half; + + union { + BitField<39, 2, HalfPrecision> precision; + BitField<39, 1, u64> ftz; + BitField<52, 1, u64> saturate; + BitField<49, 2, HalfMerge> merge; + + BitField<43, 1, u64> negate_a; + BitField<44, 1, u64> abs_a; + BitField<47, 2, HalfType> type_a; + } alu_half_imm; + + union { + BitField<29, 1, u64> first_negate; + BitField<20, 9, u64> first; + + BitField<56, 1, u64> second_negate; + BitField<30, 9, u64> second; + + u32 PackImmediates() const { + // Immediates are half floats shifted. + constexpr u32 imm_shift = 6; + return static_cast<u32>((first << imm_shift) | (second << (16 + imm_shift))); + } + } half_imm; + + union { + union { + BitField<37, 2, HalfPrecision> precision; + BitField<32, 1, u64> saturate; + + BitField<30, 1, u64> negate_c; + BitField<35, 2, HalfType> type_c; + } rr; + + BitField<57, 2, HalfPrecision> precision; + BitField<52, 1, u64> saturate; + + BitField<49, 2, HalfMerge> merge; + + BitField<47, 2, HalfType> type_a; + + BitField<56, 1, u64> negate_b; + BitField<28, 2, HalfType> type_b; + + BitField<51, 1, u64> negate_c; + BitField<53, 2, HalfType> type_reg39; + } hfma2; + + union { BitField<40, 1, u64> invert; } popc; @@ -669,7 +757,6 @@ union Instruction { BitField<45, 2, PredOperation> op; BitField<47, 1, u64> ftz; BitField<48, 4, PredCondition> cond; - BitField<56, 1, u64> neg_b; } fsetp; union { @@ -717,6 +804,23 @@ union Instruction { } csetp; union { + BitField<35, 4, PredCondition> cond; + BitField<49, 1, u64> h_and; + BitField<6, 1, u64> ftz; + BitField<45, 2, PredOperation> op; + BitField<3, 3, u64> pred3; + BitField<0, 3, u64> pred0; + BitField<43, 1, u64> negate_a; + BitField<44, 1, u64> abs_a; + BitField<47, 2, HalfType> type_a; + BitField<31, 1, u64> negate_b; + BitField<30, 1, u64> abs_b; + BitField<28, 2, HalfType> type_b; + BitField<42, 1, u64> neg_pred; + BitField<39, 3, u64> pred39; + } hsetp2; + + union { BitField<39, 3, u64> pred39; BitField<42, 1, u64> neg_pred; BitField<43, 1, u64> neg_a; @@ -727,10 +831,24 @@ union Instruction { BitField<53, 1, u64> neg_b; BitField<54, 1, u64> abs_a; BitField<55, 1, u64> ftz; - BitField<56, 1, u64> neg_imm; } fset; union { + BitField<49, 1, u64> bf; + BitField<35, 3, PredCondition> cond; + BitField<50, 1, u64> ftz; + BitField<45, 2, PredOperation> op; + BitField<43, 1, u64> negate_a; + BitField<44, 1, u64> abs_a; + BitField<47, 2, HalfType> type_a; + BitField<31, 1, u64> negate_b; + BitField<30, 1, u64> abs_b; + BitField<28, 2, HalfType> type_b; + BitField<42, 1, u64> neg_pred; + BitField<39, 3, u64> pred39; + } hset2; + + union { BitField<39, 3, u64> pred39; BitField<42, 1, u64> neg_pred; BitField<44, 1, u64> bf; @@ -1147,6 +1265,18 @@ public: LEA_RZ, LEA_IMM, LEA_HI, + HADD2_C, + HADD2_R, + HADD2_IMM, + HMUL2_C, + HMUL2_R, + HMUL2_IMM, + HFMA2_CR, + HFMA2_RC, + HFMA2_RR, + HFMA2_IMM_R, + HSETP2_R, + HSET2_R, POPC_C, POPC_R, POPC_IMM, @@ -1220,9 +1350,12 @@ public: ArithmeticImmediate, ArithmeticInteger, ArithmeticIntegerImmediate, + ArithmeticHalf, + ArithmeticHalfImmediate, Bfe, Shift, Ffma, + Hfma2, Flow, Synch, Memory, @@ -1230,6 +1363,8 @@ public: FloatSetPredicate, IntegerSet, IntegerSetPredicate, + HalfSet, + HalfSetPredicate, PredicateSetPredicate, PredicateSetRegister, Conversion, @@ -1393,6 +1528,18 @@ private: INST("001101101101----", Id::LEA_IMM, Type::ArithmeticInteger, "LEA_IMM"), INST("010010111101----", Id::LEA_RZ, Type::ArithmeticInteger, "LEA_RZ"), INST("00011000--------", Id::LEA_HI, Type::ArithmeticInteger, "LEA_HI"), + INST("0111101-1-------", Id::HADD2_C, Type::ArithmeticHalf, "HADD2_C"), + INST("0101110100010---", Id::HADD2_R, Type::ArithmeticHalf, "HADD2_R"), + INST("0111101-0-------", Id::HADD2_IMM, Type::ArithmeticHalfImmediate, "HADD2_IMM"), + INST("0111100-1-------", Id::HMUL2_C, Type::ArithmeticHalf, "HMUL2_C"), + INST("0101110100001---", Id::HMUL2_R, Type::ArithmeticHalf, "HMUL2_R"), + INST("0111100-0-------", Id::HMUL2_IMM, Type::ArithmeticHalfImmediate, "HMUL2_IMM"), + INST("01110---1-------", Id::HFMA2_CR, Type::Hfma2, "HFMA2_CR"), + INST("01100---1-------", Id::HFMA2_RC, Type::Hfma2, "HFMA2_RC"), + INST("0101110100000---", Id::HFMA2_RR, Type::Hfma2, "HFMA2_RR"), + INST("01110---0-------", Id::HFMA2_IMM_R, Type::Hfma2, "HFMA2_R_IMM"), + INST("0101110100100---", Id::HSETP2_R, Type::HalfSetPredicate, "HSETP_R"), + INST("0101110100011---", Id::HSET2_R, Type::HalfSet, "HSET2_R"), INST("0101000010000---", Id::MUFU, Type::Arithmetic, "MUFU"), INST("0100110010010---", Id::RRO_C, Type::Arithmetic, "RRO_C"), INST("0101110010010---", Id::RRO_R, Type::Arithmetic, "RRO_R"), |